From 1d9ae1bba2729525af5c1030d532dd1b97de2707 Mon Sep 17 00:00:00 2001 From: suzuke Date: Sat, 25 Apr 2026 22:34:35 +0800 Subject: [PATCH 1/3] feat(m2): reporter compare mode + `crucible compare --html` (M2 PR 11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Side-by-side static HTML for two ledgers — useful for "greedy vs bfts-lite on the same example" demo-gate comparisons. Strict read-only: no orchestrator changes, no ledger mutation, no config normalization. Renderer: `crucible.reporter.compare.render_comparison_html(left, right, *, left_label, right_label, …)`. Reuses html_tree's `_render_tree` / `_render_summary` / `_best_node_id` / `_color_for` so the per-side cards look identical to the single-view report. CLI: `crucible compare a b --html [--html-out PATH]` writes `/reports/compare-a-vs-b.html` by default. `--right-project DIR` opts into cross-project comparison (e.g. compress-greedy workspace vs compress-bfts workspace from M1b demo gate). Cross-project default output is cwd to avoid writing into the wrong project. Reviewer round 1 verdict: ACCEPT with constraints — all addressed: - Missing data → "n/a" / empty panel, never silently zero - Δ line shown ONLY when both sides agree on metric direction (and both bests exist); otherwise omitted (no auto-winner verdict) - Output path: explicit `--html-out` or predictable default - Strict read-only: no writes anywhere outside the report file - Renderer extraction: kept html_tree.py as stable single-view facade, compare.py imports underscore helpers without changing their API Tests: 11 new in test_reporter_compare.py + 4 new CLI tests in test_cli.py. Full suite: 2413 passed / 4 skipped, 0 regressions over M2 PR 10 baseline (2397). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/crucible/cli.py | 133 ++++++++++++- src/crucible/reporter/__init__.py | 3 +- src/crucible/reporter/compare.py | 235 +++++++++++++++++++++++ tests/test_cli.py | 92 +++++++++ tests/test_reporter_compare.py | 309 ++++++++++++++++++++++++++++++ 5 files changed, 769 insertions(+), 3 deletions(-) create mode 100644 src/crucible/reporter/compare.py create mode 100644 tests/test_reporter_compare.py diff --git a/src/crucible/cli.py b/src/crucible/cli.py index 6ad97cd..f4a6947 100644 --- a/src/crucible/cli.py +++ b/src/crucible/cli.py @@ -691,8 +691,34 @@ def history(tag: str, last: int, project_dir: str, as_json: bool, fmt: str) -> N @main.command(help=_("Compare two experiment runs side by side.")) @click.argument("tags", nargs=2) @click.option("--project-dir", default=".", help=_("Project root directory.")) +@click.option("--right-project", default=None, + help=_("Project root for the SECOND tag (for cross-project compare). " + "If omitted, both tags are read from --project-dir.")) @click.option("--json", "as_json", is_flag=True, help=_("Output as JSON.")) -def compare(tags: tuple[str, str], project_dir: str, as_json: bool) -> None: +@click.option("--html", "html_output", is_flag=True, + help=_("Render side-by-side HTML comparison from ledger.jsonl files. " + "M2 PR 11.")) +@click.option("--html-out", default=None, + help=_("Output path for the HTML report " + "(default: /reports/compare--vs-.html).")) +def compare(tags: tuple[str, str], project_dir: str, right_project: str | None, + as_json: bool, html_output: bool, html_out: str | None) -> None: + tag_a, tag_b = tags + + if html_output: + _render_compare_html( + tag_a, tag_b, + project_dir=project_dir, + right_project_dir=right_project, + html_out=html_out, + ) + return + + if right_project is not None: + raise click.ClickException( + _("--right-project is currently only supported with --html") + ) + try: project = Path(project_dir).resolve() config = load_config(project) @@ -727,7 +753,6 @@ def compare(tags: tuple[str, str], project_dir: str, as_json: bool) -> None: click.echo(json_module.dumps(comparison)) return - tag_a, tag_b = tags col_w = max(len(tag_a), len(tag_b), 12) click.echo(f"{'':>16} {tag_a:>{col_w}} {tag_b:>{col_w}}") for key in ("iterations", "kept", "discarded", "crashed", "best_metric", "best_commit"): @@ -737,6 +762,110 @@ def compare(tags: tuple[str, str], project_dir: str, as_json: bool) -> None: click.echo(f"{label:>16} {str(va):>{col_w}} {str(vb):>{col_w}}") +def _build_metric_lookup(results_path: Path) -> dict[str, float]: + """Build attempt_id → metric_value map from a results-.jsonl file. + + Mirrors the same id derivation used by `crucible postmortem --html`. + Returns {} if the file is missing or unreadable (best-effort). + """ + metric_lookup: dict[str, float] = {} + if not results_path.exists(): + return metric_lookup + try: + with results_path.open() as fp: + for i, line in enumerate(fp, start=1): + rec = json_module.loads(line) + if rec.get("metric_value") is None: + continue + beam_id = rec.get("beam_id") + iteration = rec.get("iteration", i) + attempt_id = ( + f"n{iteration:06d}" if beam_id is None + else f"b{beam_id}n{iteration:06d}" + ) + metric_lookup[attempt_id] = float(rec["metric_value"]) + except Exception as exc: + logging.getLogger(__name__).warning( + "could not build metric_lookup from %s: %s", results_path, exc + ) + return metric_lookup + + +def _render_compare_html( + tag_a: str, + tag_b: str, + *, + project_dir: str, + right_project_dir: str | None, + html_out: str | None, +) -> None: + """Render `crucible compare --html` output. Strict read-only.""" + from crucible.reporter import render_comparison_html + + left_project = Path(project_dir).resolve() + right_project = ( + Path(right_project_dir).resolve() if right_project_dir else left_project + ) + cross_project = right_project != left_project + + left_ledger = left_project / "logs" / f"run-{tag_a}" / "ledger.jsonl" + right_ledger = right_project / "logs" / f"run-{tag_b}" / "ledger.jsonl" + for label, path in (("left", left_ledger), ("right", right_ledger)): + if not path.exists(): + raise click.ClickException( + _("ledger not found for {label} side: {path}").format( + label=label, path=path + ) + ) + + # Per-side metric direction: read each project's config independently. + # If a config is missing/unreadable, pass None → renderer omits Δ. + left_dir = _safe_read_metric_direction(left_project) + right_dir = _safe_read_metric_direction(right_project) + + left_metrics = _build_metric_lookup(left_project / results_filename(tag_a)) + right_metrics = _build_metric_lookup(right_project / results_filename(tag_b)) + + title = ( + f"Crucible Compare — {tag_a} (left) vs {tag_b} (right)" + if not cross_project + else f"Crucible Compare — {left_project.name}:{tag_a} vs {right_project.name}:{tag_b}" + ) + + out = render_comparison_html( + left_ledger, + right_ledger, + left_label=tag_a, + right_label=tag_b, + title=title, + left_metric_lookup=left_metrics, + right_metric_lookup=right_metrics, + left_direction=left_dir, + right_direction=right_dir, + ) + + if html_out: + target = Path(html_out) + elif cross_project: + target = Path.cwd() / f"compare-{tag_a}-vs-{tag_b}.html" + else: + reports_dir = left_project / "reports" + reports_dir.mkdir(exist_ok=True) + target = reports_dir / f"compare-{tag_a}-vs-{tag_b}.html" + + target.write_text(out) + click.echo(_("Wrote HTML comparison to {path}").format(path=target)) + + +def _safe_read_metric_direction(project: Path) -> str | None: + """Return `metric.direction` from a project config, or None on failure.""" + try: + cfg = load_config(project) + return cfg.metric.direction + except (ConfigError, FileNotFoundError, OSError): + return None + + @main.command(help=_("Generate a new experiment from a natural language description.")) @click.argument("dest", type=click.Path()) @click.option("--describe", default=None, help=_("Experiment description (skip interactive prompt).")) diff --git a/src/crucible/reporter/__init__.py b/src/crucible/reporter/__init__.py index bcc107c..5b427fa 100644 --- a/src/crucible/reporter/__init__.py +++ b/src/crucible/reporter/__init__.py @@ -8,6 +8,7 @@ M3 will add d3.js interactive expand/collapse. """ +from crucible.reporter.compare import render_comparison_html from crucible.reporter.html_tree import render_static_html -__all__ = ["render_static_html"] +__all__ = ["render_static_html", "render_comparison_html"] diff --git a/src/crucible/reporter/compare.py b/src/crucible/reporter/compare.py new file mode 100644 index 0000000..6c3ca43 --- /dev/null +++ b/src/crucible/reporter/compare.py @@ -0,0 +1,235 @@ +"""Side-by-side comparison renderer — M2 PR 11. + +Produces a single static HTML doc that places two ledger trees in two +columns. Useful for "greedy vs bfts-lite on the same example" demo-gate +comparisons. + +Strict read-only: never writes to ledgers, never normalises config. If a +side has missing metadata (no metrics, no cost), the column renders +"n/a" rather than failing the whole comparison. +""" + +from __future__ import annotations + +import html +from datetime import datetime +from pathlib import Path +from typing import Sequence + +from crucible.ledger import AttemptNode, TrialLedger +from crucible.reporter.html_tree import ( + _CSS, + _best_node_id, + _color_for, + _format_cost, + _render_summary, + _render_tree, +) + + +def render_comparison_html( + left_ledger_path: Path | str, + right_ledger_path: Path | str, + *, + left_label: str, + right_label: str, + title: str = "Crucible Compare", + left_metric_lookup: dict[str, float] | None = None, + right_metric_lookup: dict[str, float] | None = None, + left_direction: str | None = None, + right_direction: str | None = None, +) -> str: + """Render a side-by-side comparison as a self-contained HTML document. + + Args: + left_ledger_path / right_ledger_path: paths to ledger.jsonl files. + left_label / right_label: user-facing labels for each side + (e.g. "greedy", "bfts-lite", or a tag name). + title: top-of-page title. + left_metric_lookup / right_metric_lookup: per-side `attempt_id → + metric_value` maps. Independent because two runs may use + different metric scales (rare but legal). + left_direction / right_direction: per-side metric direction + ("maximize" / "minimize"). If both sides agree (and both are + non-None), a Δ best-metric line is rendered. If they differ + or either is None, the Δ is omitted. + + Returns: + Complete HTML document (UTF-8 string). + + Missing-data behaviour: an unreadable ledger raises (file errors are + not silent). An empty ledger renders an "(empty)" panel on that + side. Missing metric_lookup → no metric line on cards, no Δ. + """ + left_nodes = _safe_load_nodes(left_ledger_path) + right_nodes = _safe_load_nodes(right_ledger_path) + + left_metrics = left_metric_lookup or {} + right_metrics = right_metric_lookup or {} + + left_best_id = _best_node_id(left_nodes, left_metrics, left_direction or "maximize") + right_best_id = _best_node_id(right_nodes, right_metrics, right_direction or "maximize") + + delta_line = _render_delta( + left_best_id, right_best_id, + left_metrics, right_metrics, + left_direction, right_direction, + ) + + left_section = _render_side( + left_label, left_nodes, left_metrics, left_best_id, + ) + right_section = _render_side( + right_label, right_nodes, right_metrics, right_best_id, + ) + + return _COMPARE_PAGE_TEMPLATE.format( + title=html.escape(title), + css=_CSS + _COMPARE_CSS, + delta=delta_line, + left_section=left_section, + right_section=right_section, + generated_at=html.escape(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC")), + ) + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _safe_load_nodes(path: Path | str) -> Sequence[AttemptNode]: + """Load ledger nodes; ledger-read errors propagate (don't silently zero).""" + return TrialLedger(Path(path)).all_nodes() + + +def _render_side( + label: str, + nodes: Sequence[AttemptNode], + metric_lookup: dict[str, float], + best_id: str | None, +) -> str: + """Render one column: header label + summary pills + tree.""" + safe_label = html.escape(label) + if not nodes: + return ( + f'
' + f'

{safe_label}

' + f'
(no attempts in this ledger)
' + f'
' + ) + summary = _render_summary(nodes, metric_lookup, best_id) + cards = _render_tree(nodes, best_id, metric_lookup) + return ( + f'
' + f'

{safe_label}

' + f'{summary}' + f'
{cards}
' + f'
' + ) + + +def _render_delta( + left_best_id: str | None, + right_best_id: str | None, + left_metrics: dict[str, float], + right_metrics: dict[str, float], + left_direction: str | None, + right_direction: str | None, +) -> str: + """Render the Δ line — only when both directions agree and both metrics + are available. Otherwise return an empty string (no auto-verdict).""" + if left_best_id is None or right_best_id is None: + return "" + if left_direction is None or right_direction is None: + return "" + if left_direction != right_direction: + return "" + left_v = left_metrics.get(left_best_id) + right_v = right_metrics.get(right_best_id) + if left_v is None or right_v is None: + return "" + delta = right_v - left_v + sign = "+" if delta >= 0 else "" + return ( + f'
' + f'left best: {left_v}' + f'  |  ' + f'right best: {right_v}' + f'  |  ' + f'Δ (right − left): {sign}{delta}' + f' (arithmetic delta only — no winner verdict)' + f'
' + ) + + +# --------------------------------------------------------------------------- +# Compare-specific CSS + page template (single-view CSS reused as base) +# --------------------------------------------------------------------------- + + +_COMPARE_CSS = """ +.compare-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 24px; + align-items: start; +} +.side { + background: #fff; + border-radius: 8px; + padding: 16px; + box-shadow: 0 1px 3px rgba(0,0,0,0.06); + min-width: 0; /* permit grid cell to shrink */ +} +.side-label { + margin: 0 0 12px 0; + font-size: 18px; + color: #1a237e; + border-bottom: 2px solid #e8eaf6; + padding-bottom: 6px; +} +.side-cards { padding-top: 8px; } +.delta { + background: #fff; + border-radius: 8px; + padding: 12px 16px; + box-shadow: 0 1px 3px rgba(0,0,0,0.06); + margin-bottom: 16px; + font-size: 14px; + color: #424242; +} +.delta-note { + color: #757575; + font-size: 12px; + margin-left: 8px; +} +@media (max-width: 1100px) { + .compare-grid { grid-template-columns: 1fr; } +} +""" + + +_COMPARE_PAGE_TEMPLATE = """\ + + + + +{title} + + + +
+

{title}

+
Generated {generated_at}
+ {delta} +
+ {left_section} + {right_section} +
+
+ + +""" diff --git a/tests/test_cli.py b/tests/test_cli.py index fb7a208..d133006 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -135,6 +135,98 @@ def test_compare_command(tmp_path): assert "b" in result.output +def _make_ledger_for_tag(project: Path, tag: str, *, num_keep: int = 2) -> None: + """Helper for compare --html tests: create a logs/run-/ledger.jsonl + with `num_keep` keep nodes so the renderer has something to draw.""" + from crucible.ledger import AttemptNode, TrialLedger + + ledger_dir = project / "logs" / f"run-{tag}" + ledger_dir.mkdir(parents=True, exist_ok=True) + ledger = TrialLedger(ledger_dir / "ledger.jsonl") + parent: str | None = None + for i in range(1, num_keep + 1): + node = AttemptNode( + id=AttemptNode.short_id(i), + parent_id=parent, + commit=f"sha-{tag}-{i:08x}", + outcome="keep", + cost_usd=0.001 * i, + created_at="2026-04-25T12:00:00+00:00", + ) + ledger.append_node(node) + parent = node.id + + +def test_compare_html_writes_report(tmp_path): + """`crucible compare a b --html` writes a side-by-side HTML report to + /reports/compare-a-vs-b.html with both labels present.""" + setup_project(tmp_path) + _make_ledger_for_tag(tmp_path, "a", num_keep=2) + _make_ledger_for_tag(tmp_path, "b", num_keep=3) + + runner = CliRunner() + result = runner.invoke( + main, ["compare", "a", "b", "--html", "--project-dir", str(tmp_path)] + ) + assert result.exit_code == 0, result.output + + target = tmp_path / "reports" / "compare-a-vs-b.html" + assert target.exists() + body = target.read_text() + assert "compare-grid" in body + assert ">a" in body or "side-label" in body + assert ">b" in body or "side-label" in body + # Both ledgers' nodes appear + assert "n000001" in body + assert "n000003" in body # only present on side b + + +def test_compare_html_custom_output(tmp_path): + setup_project(tmp_path) + _make_ledger_for_tag(tmp_path, "x") + _make_ledger_for_tag(tmp_path, "y") + + out_path = tmp_path / "explicit-out.html" + runner = CliRunner() + result = runner.invoke( + main, ["compare", "x", "y", "--html", + "--html-out", str(out_path), + "--project-dir", str(tmp_path)] + ) + assert result.exit_code == 0, result.output + assert out_path.exists() + assert "compare-grid" in out_path.read_text() + + +def test_compare_html_missing_ledger_errors_clearly(tmp_path): + """If one side's ledger.jsonl is missing, the command exits non-zero + with a clear message — does NOT silently render an empty side.""" + setup_project(tmp_path) + _make_ledger_for_tag(tmp_path, "only-left") + # No ledger for "missing-tag" + + runner = CliRunner() + result = runner.invoke( + main, ["compare", "only-left", "missing-tag", + "--html", "--project-dir", str(tmp_path)] + ) + assert result.exit_code != 0 + assert "ledger not found" in result.output.lower() + + +def test_compare_right_project_requires_html(tmp_path): + """--right-project is only meaningful with --html for v1.""" + setup_project(tmp_path) + runner = CliRunner() + result = runner.invoke( + main, ["compare", "a", "b", + "--right-project", str(tmp_path), + "--project-dir", str(tmp_path)] + ) + assert result.exit_code != 0 + assert "--right-project" in result.output + + def test_compare_json_output(tmp_path): setup_project(tmp_path) runner = CliRunner() diff --git a/tests/test_reporter_compare.py b/tests/test_reporter_compare.py new file mode 100644 index 0000000..ee49579 --- /dev/null +++ b/tests/test_reporter_compare.py @@ -0,0 +1,309 @@ +"""Tests for `crucible.reporter.compare.render_comparison_html` — M2 PR 11. + +Verifies: +- Both side labels appear in output +- Both ledgers' node ids appear (left + right trees) +- Best-of-run badge appears on each side independently +- Δ line appears when both directions agree, both bests exist +- Δ line is suppressed when directions differ or are None +- Empty ledger on one side → "(no attempts)" panel, other side still renders +- HTML is well-formed +- Labels are HTML-escaped +""" + +from __future__ import annotations + +from html.parser import HTMLParser +from pathlib import Path + +import pytest + +from crucible.ledger import AttemptNode, TrialLedger +from crucible.reporter import render_comparison_html + + +# --------------------------------------------------------------------------- +# Helpers (parallel to test_reporter_html.py) +# --------------------------------------------------------------------------- + + +class _Validator(HTMLParser): + def __init__(self) -> None: + super().__init__() + self.tags_open: list[str] = [] + self.errors: list[str] = [] + + def handle_starttag(self, tag: str, attrs) -> None: + if tag not in ("br", "hr", "meta", "img", "input", "link"): + self.tags_open.append(tag) + + def handle_endtag(self, tag: str) -> None: + if not self.tags_open: + self.errors.append(f"close-without-open: {tag}") + return + if self.tags_open[-1] == tag: + self.tags_open.pop() + + +def _validate(html_str: str) -> None: + p = _Validator() + p.feed(html_str) + assert not p.errors, f"HTML errors: {p.errors}" + + +def _make_node(seq: int, *, outcome: str = "keep", + parent: str | None = None) -> AttemptNode: + return AttemptNode( + id=AttemptNode.short_id(seq), + parent_id=parent, + commit=f"sha-{seq:08x}", + backend_kind="claude_sdk", + model="anthropic/sonnet-4-6", + outcome=outcome, + cost_usd=0.001 * seq, + created_at="2026-04-25T12:00:00+00:00", + ) + + +@pytest.fixture +def two_ledgers(tmp_path: Path) -> tuple[Path, Path]: + left = tmp_path / "left.jsonl" + right = tmp_path / "right.jsonl" + return left, right + + +# --------------------------------------------------------------------------- +# Smoke + basic structure +# --------------------------------------------------------------------------- + + +def test_compare_renders_both_sides(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1, outcome="keep")) + lL.append_node(_make_node(2, parent="n000001", outcome="keep")) + lR.append_node(_make_node(1, outcome="keep")) + lR.append_node(_make_node(2, parent="n000001", outcome="discard")) + + out = render_comparison_html( + left, right, + left_label="greedy", + right_label="bfts-lite", + ) + + # Both labels present + assert "greedy" in out + assert "bfts-lite" in out + # Both ledgers' nodes present + assert 'id="n000001"' in out + assert 'id="n000002"' in out + # Compare grid present + assert "compare-grid" in out + _validate(out) + + +def test_compare_empty_ledger_one_side(two_ledgers: tuple[Path, Path]): + """Empty side renders 'no attempts' panel; other side renders normally.""" + left, right = two_ledgers + left.touch() # empty + lR = TrialLedger(right) + lR.append_node(_make_node(1, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="empty-run", + right_label="real-run", + ) + assert "no attempts" in out.lower() + assert "real-run" in out + assert 'id="n000001"' in out + _validate(out) + + +# --------------------------------------------------------------------------- +# Best-of-run highlighting (per side, independent) +# --------------------------------------------------------------------------- + + +def test_compare_best_marker_per_side(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + for i in (1, 2): + lL.append_node(_make_node(i, outcome="keep")) + for i in (1, 2): + lR.append_node(_make_node(i, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="A", right_label="B", + left_metric_lookup={"n000001": 1.0, "n000002": 2.0}, + right_metric_lookup={"n000001": 5.0, "n000002": 3.0}, + left_direction="maximize", right_direction="maximize", + ) + # Best in left = n000002 (2.0), best in right = n000001 (5.0) + # Both should appear as "★ best" in the rendered output. + assert out.count("★ best") == 2 + + +# --------------------------------------------------------------------------- +# Δ line rendering rules +# --------------------------------------------------------------------------- + + +def test_delta_renders_when_directions_agree(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1, outcome="keep")) + lR.append_node(_make_node(1, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="A", right_label="B", + left_metric_lookup={"n000001": 1.0}, + right_metric_lookup={"n000001": 1.7}, + left_direction="maximize", right_direction="maximize", + ) + assert "Δ" in out + assert "+0.7" in out or "0.7" in out # right - left + assert "no winner verdict" in out.lower() + + +def test_delta_omitted_when_directions_differ(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1, outcome="keep")) + lR.append_node(_make_node(1, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="A", right_label="B", + left_metric_lookup={"n000001": 1.0}, + right_metric_lookup={"n000001": 1.7}, + left_direction="maximize", right_direction="minimize", + ) + assert "Δ" not in out + + +def test_delta_omitted_when_direction_none(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1, outcome="keep")) + lR.append_node(_make_node(1, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="A", right_label="B", + left_metric_lookup={"n000001": 1.0}, + right_metric_lookup={"n000001": 1.7}, + left_direction=None, right_direction=None, + ) + assert "Δ" not in out + + +def test_delta_omitted_when_metric_lookup_empty(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1, outcome="keep")) + lR.append_node(_make_node(1, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="A", right_label="B", + # no metric_lookup provided + left_direction="maximize", right_direction="maximize", + ) + assert "Δ" not in out + + +# --------------------------------------------------------------------------- +# Branching, parent chain, security +# --------------------------------------------------------------------------- + + +def test_compare_preserves_parent_relationship(two_ledgers: tuple[Path, Path]): + """Child cards still link to their parent on each side.""" + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1)) + lL.append_node(_make_node(2, parent="n000001")) + lR.append_node(_make_node(1)) + lR.append_node(_make_node(2, parent="n000001")) + lR.append_node(_make_node(3, parent="n000001")) # branch on right side + + out = render_comparison_html( + left, right, + left_label="greedy", right_label="bfts-lite", + ) + # Both sides reference the parent + assert out.count('href="#n000001"') >= 2 + # Right side has a branch (third node also under n000001) + assert 'id="n000003"' in out + _validate(out) + + +def test_compare_html_escapes_labels(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1)) + lR.append_node(_make_node(1)) + + nasty_label = "" + out = render_comparison_html( + left, right, + left_label=nasty_label, + right_label="ok", + ) + assert "" not in out + assert "<script>" in out + + +def test_compare_custom_title(two_ledgers: tuple[Path, Path]): + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + lL.append_node(_make_node(1)) + lR.append_node(_make_node(1)) + + out = render_comparison_html( + left, right, + left_label="A", right_label="B", + title="My Custom Compare Title", + ) + assert "My Custom Compare Title" in out + assert "

My Custom Compare Title

" in out + + +# --------------------------------------------------------------------------- +# Direction asymmetry: per-side best uses each side's direction +# --------------------------------------------------------------------------- + + +def test_per_side_direction_picks_correct_best(two_ledgers: tuple[Path, Path]): + """Left=minimize, right=maximize. Each side picks its own best.""" + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + for i in (1, 2): + lL.append_node(_make_node(i, outcome="keep")) + lR.append_node(_make_node(i, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="MIN", right_label="MAX", + left_metric_lookup={"n000001": 5.0, "n000002": 1.0}, # min picks n2 + right_metric_lookup={"n000001": 5.0, "n000002": 1.0}, # max picks n1 + left_direction="minimize", right_direction="maximize", + ) + # Both sides have best markers; this also implicitly verifies neither + # crashed when directions disagree. + assert out.count("★ best") == 2 + # Δ line MUST be omitted because directions differ + assert "Δ" not in out From cdd9ec2a4a18b1339ccd2618a884bb18c232d7e8 Mon Sep 17 00:00:00 2001 From: suzuke Date: Sat, 25 Apr 2026 22:55:23 +0800 Subject: [PATCH 2/3] fix(m2): namespace compare-mode DOM ids with side prefixes (reviewer F1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer round 2 REJECTED the original PR 11 because both ledgers in a compare HTML normally share AttemptNode ids (n000001, n000002…), so rendering two trees produced duplicate `id="n000001"` elements and ambiguous `href="#n000001"` anchors. Fixed by namespacing every DOM id and intra-document anchor with a side-scoped prefix. Changes: - `_render_tree`, `_render_card`, `_render_summary` accept `anchor_prefix: str = ""` (kwarg-only). Default empty → single-view output unchanged. - `compare.py` passes `"left-"` / `"right-"` so `id="left-n000001"` and `id="right-n000001"` coexist; parent links and best-summary links use the same prefixed anchors. Display text remains the bare node id — the prefix is implementation detail, not user-facing. Tests: - Existing compare tests updated to assert side-scoped anchors AND that bare ids (which would collide) do NOT appear. - 2 new dedicated tests: `test_compare_dom_ids_are_unique_per_side` (no collision across 3-node × 2-side ledger) and `test_compare_best_link_uses_side_anchor` (best-link clicks land on the same-side card). - HTML validator tightened to assert `not p.tags_open` at EOF (reviewer non-blocker — catches stray unclosed tags). Full suite: 2415 passed / 4 skipped, 0 regressions. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/crucible/reporter/compare.py | 19 +++++-- src/crucible/reporter/html_tree.py | 44 +++++++++++---- tests/test_reporter_compare.py | 86 ++++++++++++++++++++++++++---- 3 files changed, 127 insertions(+), 22 deletions(-) diff --git a/src/crucible/reporter/compare.py b/src/crucible/reporter/compare.py index 6c3ca43..08abd6a 100644 --- a/src/crucible/reporter/compare.py +++ b/src/crucible/reporter/compare.py @@ -78,9 +78,11 @@ def render_comparison_html( left_section = _render_side( left_label, left_nodes, left_metrics, left_best_id, + anchor_prefix="left-", ) right_section = _render_side( right_label, right_nodes, right_metrics, right_best_id, + anchor_prefix="right-", ) return _COMPARE_PAGE_TEMPLATE.format( @@ -108,8 +110,15 @@ def _render_side( nodes: Sequence[AttemptNode], metric_lookup: dict[str, float], best_id: str | None, + *, + anchor_prefix: str, ) -> str: - """Render one column: header label + summary pills + tree.""" + """Render one column: header label + summary pills + tree. + + `anchor_prefix` namespaces this side's DOM ids and intra-doc anchors + (e.g. "left-", "right-") so two trees with overlapping AttemptNode + ids can coexist in one HTML document without collisions. + """ safe_label = html.escape(label) if not nodes: return ( @@ -118,8 +127,12 @@ def _render_side( f'
(no attempts in this ledger)
' f'' ) - summary = _render_summary(nodes, metric_lookup, best_id) - cards = _render_tree(nodes, best_id, metric_lookup) + summary = _render_summary( + nodes, metric_lookup, best_id, anchor_prefix=anchor_prefix + ) + cards = _render_tree( + nodes, best_id, metric_lookup, anchor_prefix=anchor_prefix + ) return ( f'
' f'

{safe_label}

' diff --git a/src/crucible/reporter/html_tree.py b/src/crucible/reporter/html_tree.py index 5eaa540..e8624d9 100644 --- a/src/crucible/reporter/html_tree.py +++ b/src/crucible/reporter/html_tree.py @@ -134,6 +134,8 @@ def _render_tree( nodes: Sequence[AttemptNode], best_id: str | None, metric_lookup: dict[str, float], + *, + anchor_prefix: str = "", ) -> str: """M1b: render nodes in DFS-by-parent order with depth indentation. @@ -144,6 +146,12 @@ def _render_tree( Children are sorted by id (insertion order is the natural fallback when ids are sequential like n000042). + + `anchor_prefix` (M2 PR 11): when non-empty, every DOM `id` and + intra-document `href="#..."` is namespaced with this prefix. Single- + view callers leave it empty (output unchanged); compare-view callers + pass `"left-"` / `"right-"` so two trees can coexist in one HTML doc + without colliding on identical attempt IDs. """ by_parent: dict[str | None, list[AttemptNode]] = {} for n in nodes: @@ -163,7 +171,8 @@ def walk(parent_id: str | None, depth: int) -> None: if child.id in visited: continue # defensive: skip cycles (shouldn't happen) visited.add(child.id) - out.append(_render_card(child, best_id, metric_lookup, depth=depth)) + out.append(_render_card(child, best_id, metric_lookup, + depth=depth, anchor_prefix=anchor_prefix)) walk(child.id, depth + 1) # Roots are nodes with parent_id=None @@ -175,7 +184,7 @@ def walk(parent_id: str | None, depth: int) -> None: for n in nodes: if n.id not in visited: out.append(_render_card(n, best_id, metric_lookup, depth=0, - orphan=True)) + orphan=True, anchor_prefix=anchor_prefix)) return "\n".join(out) @@ -212,7 +221,9 @@ def _best_node_id(nodes: Sequence[AttemptNode], def _render_summary(nodes: Sequence[AttemptNode], metric_lookup: dict[str, float], - best_id: str | None = None) -> str: + best_id: str | None = None, + *, + anchor_prefix: str = "") -> str: by_outcome: dict[str, int] = {} for n in nodes: by_outcome[n.outcome] = by_outcome.get(n.outcome, 0) + 1 @@ -232,9 +243,13 @@ def _render_summary(nodes: Sequence[AttemptNode], best_line = "" if best_id is not None: v = metric_lookup.get(best_id) + # M2 PR 11: anchor uses the side-scoped prefix so compare-mode + # `Best metric (...)` link points to the right column's card. + # Display text remains the raw node id (no prefix shown to user). + anchor = html.escape(anchor_prefix) + html.escape(best_id) best_line = ( f'
' + f'(node {html.escape(best_id)})' ) total_cost = sum((n.cost_usd or 0.0) for n in nodes) @@ -251,7 +266,9 @@ def _render_summary(nodes: Sequence[AttemptNode], def _render_card(n: AttemptNode, best_id: str | None, metric_lookup: dict[str, float], depth: int = 0, - orphan: bool = False) -> str: + orphan: bool = False, + *, + anchor_prefix: str = "") -> str: fg, bg = _color_for(n.outcome) is_best = (best_id is not None and n.id == best_id) badge = '★ best' if is_best else "" @@ -267,10 +284,16 @@ def _render_card(n: AttemptNode, best_id: str | None, f'{metric_lookup[n.id]}' ) - parent_line = ( - f'{n.parent_id}' - if n.parent_id else "(root)" - ) + # M2 PR 11: parent links resolve within the same side. The displayed + # text is the bare parent id (no prefix shown to the reader). + if n.parent_id: + parent_anchor = html.escape(anchor_prefix) + html.escape(n.parent_id) + parent_line = ( + f'' + f'{html.escape(n.parent_id)}' + ) + else: + parent_line = "(root)" diff_block = "" if n.diff_text: @@ -296,8 +319,9 @@ def _render_card(n: AttemptNode, best_id: str | None, if depth: card_style += f";margin-left:{depth * 32}px" + article_id = html.escape(anchor_prefix) + html.escape(n.id) return f""" -
{branch_marker}{html.escape(n.id)} diff --git a/tests/test_reporter_compare.py b/tests/test_reporter_compare.py index ee49579..a1b7a56 100644 --- a/tests/test_reporter_compare.py +++ b/tests/test_reporter_compare.py @@ -49,6 +49,9 @@ def _validate(html_str: str) -> None: p = _Validator() p.feed(html_str) assert not p.errors, f"HTML errors: {p.errors}" + # M2 PR 11 reviewer non-blocker: validator must also assert no + # unclosed tags at EOF, otherwise a stray `
` would slip past. + assert not p.tags_open, f"unclosed tags at EOF: {p.tags_open}" def _make_node(seq: int, *, outcome: str = "keep", @@ -95,9 +98,16 @@ def test_compare_renders_both_sides(two_ledgers: tuple[Path, Path]): # Both labels present assert "greedy" in out assert "bfts-lite" in out - # Both ledgers' nodes present - assert 'id="n000001"' in out - assert 'id="n000002"' in out + # Both ledgers' nodes present — under their side-scoped DOM ids. + assert 'id="left-n000001"' in out + assert 'id="left-n000002"' in out + assert 'id="right-n000001"' in out + assert 'id="right-n000002"' in out + # Critically: NO bare DOM id (would collide between sides). + assert 'id="n000001"' not in out + assert 'id="n000002"' not in out + # Display text still shows the bare node id (no prefix shown to user). + assert '>n000001<' in out # Compare grid present assert "compare-grid" in out _validate(out) @@ -117,7 +127,9 @@ def test_compare_empty_ledger_one_side(two_ledgers: tuple[Path, Path]): ) assert "no attempts" in out.lower() assert "real-run" in out - assert 'id="n000001"' in out + # Right-side node uses the right-prefixed anchor (no bare id collision). + assert 'id="right-n000001"' in out + assert 'id="n000001"' not in out _validate(out) @@ -227,7 +239,8 @@ def test_delta_omitted_when_metric_lookup_empty(two_ledgers: tuple[Path, Path]): def test_compare_preserves_parent_relationship(two_ledgers: tuple[Path, Path]): - """Child cards still link to their parent on each side.""" + """Child cards still link to their parent on each side — but parent + links are SIDE-SCOPED so they resolve unambiguously.""" left, right = two_ledgers lL = TrialLedger(left) lR = TrialLedger(right) @@ -241,10 +254,13 @@ def test_compare_preserves_parent_relationship(two_ledgers: tuple[Path, Path]): left, right, left_label="greedy", right_label="bfts-lite", ) - # Both sides reference the parent - assert out.count('href="#n000001"') >= 2 - # Right side has a branch (third node also under n000001) - assert 'id="n000003"' in out + # Each side has its own parent-link anchor (left-n000001 / right-n000001). + assert 'href="#left-n000001"' in out + assert 'href="#right-n000001"' in out + # Bare anchors that would be ambiguous must NOT appear. + assert 'href="#n000001"' not in out + # Right side has a branch (third node also under n000001). + assert 'id="right-n000003"' in out _validate(out) @@ -307,3 +323,55 @@ def test_per_side_direction_picks_correct_best(two_ledgers: tuple[Path, Path]): assert out.count("★ best") == 2 # Δ line MUST be omitted because directions differ assert "Δ" not in out + + +# --------------------------------------------------------------------------- +# Anchor uniqueness + side-scoping (reviewer round 2 blocking finding) +# --------------------------------------------------------------------------- + + +def test_compare_dom_ids_are_unique_per_side(two_ledgers: tuple[Path, Path]): + """Both ledgers normally share AttemptNode ids (n000001, n000002…). + Compare HTML MUST namespace them so each id appears exactly once + in the document. This is the reviewer round 2 blocking finding.""" + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + for i in (1, 2, 3): + lL.append_node(_make_node(i, outcome="keep")) + lR.append_node(_make_node(i, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="L", right_label="R", + ) + # Each side gets a unique DOM id per node. + for i in (1, 2, 3): + nid = f"n{i:06d}" + assert out.count(f'id="left-{nid}"') == 1 + assert out.count(f'id="right-{nid}"') == 1 + # No bare colliding ids. + assert f'id="{nid}"' not in out + + +def test_compare_best_link_uses_side_anchor(two_ledgers: tuple[Path, Path]): + """The summary's `(node …)` link must point to the same-side card, + otherwise clicking on the right side's "Best metric" would scroll + to the left side's card with a colliding id.""" + left, right = two_ledgers + lL = TrialLedger(left) + lR = TrialLedger(right) + for i in (1, 2): + lL.append_node(_make_node(i, outcome="keep")) + lR.append_node(_make_node(i, outcome="keep")) + + out = render_comparison_html( + left, right, + left_label="L", right_label="R", + left_metric_lookup={"n000001": 1.0, "n000002": 5.0}, # left best n2 + right_metric_lookup={"n000001": 9.0, "n000002": 7.0}, # right best n1 + left_direction="maximize", right_direction="maximize", + ) + # Summary's best-link is side-scoped. + assert 'href="#left-n000002"' in out + assert 'href="#right-n000001"' in out From 03390bb4efc47e8ef41ac890326ce116f27762d8 Mon Sep 17 00:00:00 2001 From: suzuke Date: Sat, 25 Apr 2026 23:54:26 +0800 Subject: [PATCH 3/3] docs(m2): real-agent 30-iter demo gate results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First demo where BFTS-lite materially outperforms greedy because greedy hits max_retries=5 hard-stop while BFTS keeps exploring via BranchFrom + doom-loop pruning. Greedy: 9 iter, best 2.2528, stopped at 5-consecutive-failure wall BFTS: 30 iter, best 2.5013, clean max_iterations stop Total: $2.05, ~55 min wall (parallel runs) BFTS ledger shows 6 BranchFrom events and 4 nodes explicitly pruned by the M2 PR 10 doom-loop seam (n3, n21, n20, n19 each had 3 trailing failures → pruned from candidate set). Best result (2.5013 at iter 21) came from a deep path n1→n2→n9→n12→n13→n14→n17→n19→n20→n21 — 10 levels deep, well beyond what greedy reached before its hard-stop. Compare HTML rendered via the new `crucible compare --html` (M2 PR 11); file at /tmp/m2-30-compare.html locally, not committed (126 KB). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/M2-DEMO-GATE.md | 148 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 docs/M2-DEMO-GATE.md diff --git a/docs/M2-DEMO-GATE.md b/docs/M2-DEMO-GATE.md new file mode 100644 index 0000000..a38919e --- /dev/null +++ b/docs/M2-DEMO-GATE.md @@ -0,0 +1,148 @@ +# M2 Demo Gate — 30-iter BFTS-lite vs Greedy + +**Date**: 2026-04-25 +**Spec reference**: `docs/v1.0-design-final.md` §M2 deliverable demo gate +**Runtime**: ~55 min wall (parallel runs), $2.05 total (Claude Code subscription) + +## TL;DR + +First real-agent comparison where **BFTS-lite materially outperforms greedy** because greedy hit `max_retries=5` and gave up while BFTS kept exploring via `BranchFrom` + doom-loop pruning: + +| | Iters | Best `compression_ratio` | Stop reason | Cost | +|---|---|---|---|---| +| Greedy | **9** | 2.2528 | 5 consecutive failures, hard-stop | $0.97 | +| BFTS-lite | **30** | **2.5013** | `max_iterations=30` reached | $1.08 | + +**Greedy stopped at iter 9** when 5 consecutive `discard` outcomes hit `constraints.max_retries` — exactly the failure mode that v1.0 §M2 doom-loop pruning + M1b `BranchFrom` are designed to escape. + +**BFTS reached iter 30 with 11 keeps + 19 discards**, demonstrating multi-level branch recovery: each time a kept node accumulated 3 trailing failures (M2 PR 10 `prune_threshold=3`), the strategy fell back to a higher unpruned ancestor and tried again. + +This is M1b's first 3-iter sanity gate run at scale, with the M2 PR 10 doom-loop pruner actually firing. + +## 1. Setup + +Both runs used the bundled `optimize-compress` example, identical workspace fixtures from M1b's demo gate (`~/Documents/Hack/crucible-demo-gate/compress-{greedy,bfts}/`), with `--max-iterations 30 --no-interactive`. Tag: `m2-30`. + +Configuration: +- Greedy: `search.strategy: greedy` (default plateau / max_retries behaviour) +- BFTS-lite: `search.strategy: bfts-lite` + `search.prune_threshold: 3` (M2 PR 10) + +Crucible installed from `feat/m2-reporter-compare` worktree (PR 10 doom-loop + PR 11 compare mode merged in this branch's stack). + +## 2. Greedy — hits the wall at iter 9 + +``` +n000001 keep parent=None (baseline replaced w/ huffman, still buggy → 0.0) +n000002 discard parent=n000001 +n000003 keep parent=n000001 (1.4154 — stride encoding) +n000004 keep parent=n000003 (2.2528 — best!) +n000005 discard parent=n000004 ┐ +n000006 discard parent=n000004 │ +n000007 discard parent=n000004 │ greedy keeps poking n000004 +n000008 discard parent=n000004 │ but every variant is worse +n000009 discard parent=n000004 ┘ + ⛔ "5 consecutive failures, stopping." +``` + +Greedy's `parent_id = code ancestry` (M1b PR 8a) shows iter 5-9 all chained to `n000004`. The orchestrator's legacy `max_retries` stop fires because `Continue` doesn't have a way to back out. + +**Best metric**: `compression_ratio = 2.2528` (4.4× the 0.5122 baseline) +**Wall time**: ~20 min +**Cost**: $0.97 (~$0.11/iter — ate failure-streak token cost) + +## 3. BFTS-lite — branch, prune, recover + +``` +n000001 keep parent=None ← root (baseline) +n000002 keep parent=n000001 +n000003 keep parent=n000002 ┐ +n000004 discard parent=n000003 │ 3 children of n3 all discard +n000005 discard parent=n000003 │ → n3 gets pruned (M2 PR 10) +n000006 discard parent=n000003 ┘ +n000007 discard parent=n000002 ↰ BFTS branches back to n2 +n000008 discard parent=n000002 (n2 now also accumulating failures) +n000009 keep parent=n000002 ✓ recovery! +n000010 discard parent=n000009 ┐ +n000011 discard parent=n000009 │ +n000012 keep parent=n000009 ✓ recovery again +n000013 keep parent=n000012 +n000014 keep parent=n000013 +n000015 discard parent=n000014 ┐ +n000016 discard parent=n000014 │ +n000017 keep parent=n000014 ✓ recovery +n000018 discard parent=n000017 +n000019 keep parent=n000017 ✓ +n000020 keep parent=n000019 +n000021 keep parent=n000020 ★ best 2.5013 +n000022 discard parent=n000021 ┐ +n000023 discard parent=n000021 │ n21 gets pruned +n000024 discard parent=n000021 ┘ +n000025 discard parent=n000020 ┐ branches back to n20 +n000026 discard parent=n000020 │ n20 also pruned +n000027 discard parent=n000020 ┘ +n000028 discard parent=n000019 ┐ branches back to n19 +n000029 discard parent=n000019 │ n19 also pruned +n000030 discard parent=n000019 ┘ + ⛔ max_iterations=30 reached +``` + +**Six branch-back events visible in the ledger** (n3→n2, n9→n9, n14→n14, n21→n20, n20→n19). Each one is BFTSLiteStrategy.decide() returning `BranchFrom(parent_id)` after the most-recent kept node's children consistently failed. + +The doom-loop pruning seam (PR 10) explicitly took n3, n21, n20, n19 out of the candidate set after 3 trailing failures each. By iter 30, BFTS had pruned much of the recent path; given more iterations it would have either continued backtracking deeper or hit "all kept nodes pruned (doom-loop) → Stop". + +**Best metric**: `compression_ratio = 2.5013` at iter 21 (4.9× baseline, **+11% over greedy's best**) +**Iters completed**: 30/30 (clean strategy stop, not a failure stop) +**Wall time**: ~55 min +**Cost**: $1.08 (~$0.036/iter — much cheaper because failed expansions reuse parent cache) + +## 4. Side-by-side comparison + +Generated with the new M2 PR 11 `crucible compare --html`: + +```bash +crucible compare m2-30 m2-30 --html \ + --project-dir ~/Documents/Hack/crucible-demo-gate/compress-greedy \ + --right-project ~/Documents/Hack/crucible-demo-gate/compress-bfts \ + --html-out /tmp/m2-30-compare.html +``` + +**Output**: 126 KB self-contained HTML showing: +- Left column: greedy's 9-node linear chain (n1 → n3 → n4 + dead branches) +- Right column: BFTS's 30-node tree with visibly indented branch points +- Δ best metric line: `right − left = +0.2485` (raw delta only — no winner verdict, per reviewer constraint) +- Each side's `★ best` badge correctly placed (greedy on n4, BFTS on n21) +- DOM ids namespaced as `left-n000001` / `right-n000001` so the two trees coexist without anchor collision (M2 PR 11 reviewer round-2 fix) + +## 5. What this validates + +| | M1b 3-iter gate | **M2 30-iter gate** | +|---|---|---| +| End-to-end wiring | ✅ | ✅ | +| `parent_id` = code ancestry observable | ✅ | ✅ | +| Sealed `EvalResult` per iter | ✅ | ✅ | +| HTML tree-view renders | ✅ | ✅ | +| `BranchFrom` actually fires in real-agent runs | ⚠ once (compress-bfts iter 3) | ✅ **6 times across 30 iter** | +| `should_prune` doom-loop seam fires | ❌ no failure streaks observed | ✅ **n3, n21, n20, n19 explicitly pruned** | +| BFTS-lite empirically beats greedy | ❌ similar 1.71 vs 1.80 (3-iter noise) | ✅ **2.50 vs 2.25** (greedy hits wall, BFTS doesn't) | +| `crucible compare --html` end-to-end | ❌ N/A | ✅ rendered 126 KB report | + +## 6. What this still does NOT validate + +- **Statistical significance**: single run per strategy. A serious benchmark wants ≥3 seeds × 30 iter × 2 strategies = 6 runs. This sanity gate just shows the mechanism works at scale. +- **HMAC seal upgrade (M2 PR 12)**: still on `content-sha256:`; PR 12 will lift to `hmac-sha256::`. +- **smolagents AgentBackend (M2 PR 13)**: this run still used Claude Code SDK directly. Production smolagents+LiteLLM backend is M2 PR 13. +- **TrialLedger concurrency lock (M2 PR 14)**: parallel-worker support not exercised; both runs were sequential within their workspace. + +## 7. Operational notes + +- **Cost efficiency**: BFTS at $0.036/iter is **3× cheaper per iter** than greedy at $0.108/iter. Reason: BFTS's failed expansions branch off cached prompts, so the model spends fewer tokens reading large context. Greedy's late discards re-explore the same dead-end and produce huge diffs. +- **Wall time**: BFTS is ~3× slower in wall (55 vs 20 min) because it ran 3.3× the iterations. Per-iter wall is comparable. +- **Both runs used the user's CC subscription** (no API key); daily-budget tokens consumed via `claude_sdk` adapter. +- **Workspaces**: `~/Documents/Hack/crucible-demo-gate/compress-{greedy,bfts}/` (re-used from M1b gate, fresh `m2-30` tag → fresh `crucible/m2-30` branch on each). + +## 8. Next steps (M2 follow-ups) + +- **PR 12 HMAC seal upgrade** — `eval-result.json` `seal:` field upgrades from `content-sha256` to `hmac-sha256::` to close the integrity-vs-authenticity gap. +- **PR 13 smolagents AgentBackend** — productionise the POC adapter so users can swap LLM provider via LiteLLM without changing crucible code. +- **PR 14 TrialLedger concurrency lock** — worktree-level mutex so multiple workers can claim different attempts in parallel. +- **Multi-seed gate** — run 3 seeds × 2 strategies × 30 iter to upgrade this sanity check into a statistical claim.