v0.49.0: vstack-diagnose --baseline CI ratchet + fix vdiff on real reports

valani9 · valani9 · commit 4b1d01925494 · 2026-06-23T18:33:18.000+05:30
- Feature: vstack-diagnose --baseline <report.json> gates (with --fail-on) only
  on findings NEW vs a saved baseline — the CI ratchet (don't fail on pre-existing
  findings). Prints 'vs baseline: N new finding(s), M pre-existing.' to stderr.
- Fix: vstack.vdiff.diff_reports assumed per_pattern was a name-keyed dict, but
  real DiagnoseReports carry it as a list -> diff_reports / vstack-vdiff crashed on
  genuine vstack-diagnose output. Now normalizes both shapes (+regression test).
- _gate_exit_code now takes severities (gates all findings or only new ones).

3,236 tests.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,36 @@ project adheres to [Semantic Versioning](https://semver.org/) from
 `1.0.0` onward. During the `0.x` series, minor bumps may include
 breaking changes (see API stability promise in `vstack/__init__.py`).
 
+## [0.49.0] — 2026-06-23
+
+The CI ratchet — gate only on *new* findings — plus a `vdiff` correctness fix.
+
+### Added
+
+- **`vstack-diagnose --baseline <report.json>`** — compare the current run
+  against a saved diagnose report and, with `--fail-on`, gate **only on
+  findings that are new relative to the baseline**. This is the standard
+  ratchet: a gate won't fail on pre-existing, already-accepted findings, and
+  it tightens as you re-baseline. Prints a `vs baseline: N new, M
+  pre-existing` summary to stderr.
+
+### Fixed
+
+- **`vstack.vdiff.diff_reports` crashed on real reports.** It assumed
+  `per_pattern` was a name-keyed dict, but actual `DiagnoseReport`s (and
+  their JSON) carry `per_pattern` as a *list*. `diff_reports` (and therefore
+  the `vstack-vdiff` CLI) raised `TypeError` on genuine `vstack-diagnose`
+  output; it now normalizes both shapes. Regression test added.
+
+### Changed
+
+- `_gate_exit_code` now takes a list of severities (so the gate can score
+  either all findings or only the new ones).
+
+### Compatibility
+
+- All tests pass. `--baseline` is opt-in; no breaking changes.
+
 ## [0.48.0] — 2026-06-23
 
 SARIF output — vstack findings now flow into GitHub code scanning (Security
diff --git a/README.md b/README.md
@@ -542,8 +542,12 @@ Not on GitHub Actions? The same gating works from any shell — the core CLI is
 ```bash
 vstack-diagnose --trace run.json --fail-on high     # exit 3 if any finding ≥ high
 vstack-diagnose --trace run.json --sarif > vstack.sarif   # SARIF 2.1.0 for any code-scanning tool
+vstack-diagnose --trace run.json --fail-on high \
+  --baseline last-good.json                          # ratchet: only fail on findings NEW vs the baseline
 ```
 
+The `--baseline` ratchet gates on *new* findings only, so a CI gate won't fail on pre-existing, already-accepted findings — save a report with `--json` once, commit it as the baseline, and the gate tightens over time.
+
 ## Framework adapters
 
 Same patterns, native to your framework:
diff --git a/_diagnose/lib/cli.py b/_diagnose/lib/cli.py
@@ -306,6 +306,15 @@ def main(argv: Sequence[str] | None = None) -> int:
             "Use to gate CI directly on the diagnosis. Omit to never fail on findings."
         ),
     )
+    parser.add_argument(
+        "--baseline",
+        default=None,
+        metavar="REPORT.json",
+        help=(
+            "Path to a saved diagnose report (JSON). When set, --fail-on gates only "
+            "on findings that are NEW relative to the baseline — the CI ratchet."
+        ),
+    )
     args = parser.parse_args(argv)
 
     if args.list:
@@ -381,23 +390,42 @@ def main(argv: Sequence[str] | None = None) -> int:
         else:
             print(report.to_markdown())
 
-    # CI gate: exit non-zero when a finding reaches the --fail-on threshold.
-    return _gate_exit_code(report.findings, args.fail_on)
+    # CI gate. With --baseline, gate only on findings NEW vs the baseline
+    # (the "ratchet" — don't fail CI on pre-existing, accepted findings).
+    gated_severities = [f.severity for f in report.findings]
+    if args.baseline is not None:
+        from vstack.vdiff import diff_reports
+
+        try:
+            baseline = json.loads(Path(args.baseline).read_text())
+        except (OSError, json.JSONDecodeError) as e:
+            print(f"vstack-diagnose: could not read --baseline: {e}", file=sys.stderr)
+            return 2
+        delta = diff_reports(baseline, report)
+        new = delta.added
+        gated_severities = [d.severity_after for d in new if d.severity_after]
+        print(
+            f"vs baseline: {len(new)} new finding(s), "
+            f"{len(report.findings) - len(new)} pre-existing.",
+            file=sys.stderr,
+        )
+
+    return _gate_exit_code(gated_severities, args.fail_on)
 
 
-def _gate_exit_code(findings: "list[Any]", fail_on: str | None) -> int:
-    """Return 3 if any finding is at/above ``fail_on``, else 0.
+def _gate_exit_code(severities: list[str], fail_on: str | None) -> int:
+    """Return 3 if any severity is at/above ``fail_on``, else 0.
 
     ``fail_on=None`` never gates. Factored out for direct testing.
     """
     if fail_on is None:
         return 0
     threshold = severity_rank(fail_on)
-    above = [f for f in findings if severity_rank(f.severity) >= threshold]
+    above = [s for s in severities if severity_rank(s) >= threshold]
     if above:
-        worst = max(above, key=lambda f: severity_rank(f.severity))
+        worst = max(above, key=severity_rank)
         print(
-            f"vstack-diagnose: gate failed — found {worst.severity} finding (>= {fail_on}).",
+            f"vstack-diagnose: gate failed — found {worst} finding (>= {fail_on}).",
             file=sys.stderr,
         )
         return 3
diff --git a/_diagnose/tests/test_diagnose_cli.py b/_diagnose/tests/test_diagnose_cli.py
@@ -136,19 +136,79 @@ def test_runs_diagnose_with_none_client_markdown_output() -> None:
 
 
 def test_gate_exit_code_logic() -> None:
-    from vstack.diagnose import Finding
     from vstack.diagnose.cli import _gate_exit_code
 
-    findings = [
-        Finding(pattern="p", severity="high", title="t"),
-        Finding(pattern="q", severity="low", title="u"),
-    ]
-    assert _gate_exit_code(findings, "high") == 3  # a high finding at/above 'high'
-    assert _gate_exit_code(findings, "critical") == 0  # nothing reaches 'critical'
-    assert _gate_exit_code(findings, None) == 0  # no gate
+    severities = ["high", "low"]
+    assert _gate_exit_code(severities, "high") == 3  # a high severity at/above 'high'
+    assert _gate_exit_code(severities, "critical") == 0  # nothing reaches 'critical'
+    assert _gate_exit_code(severities, None) == 0  # no gate
     assert _gate_exit_code([], "high") == 0  # no findings
 
 
+def test_baseline_ratchet_passes_on_preexisting_findings(tmp_path) -> None:
+    # Baseline already has a high finding; the current run (client none) finds
+    # nothing new, so --fail-on high passes because nothing is NEW.
+    baseline = tmp_path / "baseline.json"
+    baseline.write_text(
+        json.dumps(
+            {
+                "shape": "individual",
+                "findings": [
+                    {
+                        "pattern": "aar",
+                        "severity": "high",
+                        "title": "known",
+                        "evidence": "",
+                        "intervention": "",
+                    }
+                ],
+                "errors": {},
+                "per_pattern": [
+                    {"pattern": "aar", "elapsed_seconds": 0, "finding_count": 1, "error": None}
+                ],
+            }
+        )
+    )
+    payload = {
+        "agent_id": "a",
+        "goal": "g",
+        "steps": [{"timestamp": "2026-01-01T00:00:00Z", "type": "observation", "content": "x"}],
+        "outcome": "o",
+        "success": False,
+    }
+    code, _out, err = _run(
+        [
+            "--client",
+            "none",
+            "--shape",
+            "individual",
+            "--fail-on",
+            "high",
+            "--baseline",
+            str(baseline),
+        ],
+        stdin=json.dumps(payload),
+    )
+    assert code == 0
+    assert "new finding" in err  # the baseline summary line
+
+
+def test_baseline_missing_file_returns_2(tmp_path) -> None:
+    payload = {
+        "agent_id": "a",
+        "goal": "g",
+        "steps": [{"timestamp": "2026-01-01T00:00:00Z", "type": "observation", "content": "x"}],
+        "outcome": "o",
+        "success": False,
+    }
+    code, _out, err = _run(
+        ["--client", "none", "--baseline", str(tmp_path / "nope.json")],
+        stdin=json.dumps(payload),
+    )
+    assert code == 2
+    assert "--baseline" in err
+
+
 def test_fail_on_no_findings_exits_zero() -> None:
     # With --client none on a thin trace there are no findings, so the gate
     # passes (exit 0) even at the strictest threshold.
diff --git a/_packaging/vstack/__init__.py b/_packaging/vstack/__init__.py
@@ -73,7 +73,7 @@
 
 from __future__ import annotations
 
-__version__ = "0.48.0"
+__version__ = "0.49.0"
 
 # The diagnose() function and PATTERNS registry are lazy-imported below
 # so that ``import vstack`` itself stays cheap. Pattern sub-packages
diff --git a/_vdiff/lib/_diff.py b/_vdiff/lib/_diff.py
@@ -18,11 +18,25 @@ def _get_findings(report: Any) -> list[dict[str, Any]]:
 
 
 def _get_per_pattern(report: Any) -> dict[str, Any]:
+    """Return ``{pattern_name: entry}`` regardless of report shape.
+
+    A real ``DiagnoseReport`` (and its JSON form) carries ``per_pattern`` as a
+    *list* of per-pattern results (objects or dicts, each with a ``pattern``
+    field); older/synthetic reports use a dict keyed by pattern name. Normalize
+    both so diffing works on actual ``vstack-diagnose`` output.
+    """
     if isinstance(report, dict):
-        return dict(report.get("per_pattern", {}))
-    if hasattr(report, "per_pattern"):
-        return dict(report.per_pattern)
-    return {}
+        raw = report.get("per_pattern", [])
+    else:
+        raw = getattr(report, "per_pattern", [])
+    if isinstance(raw, dict):
+        return dict(raw)
+    out: dict[str, Any] = {}
+    for item in raw or []:
+        name = item.get("pattern") if isinstance(item, dict) else getattr(item, "pattern", None)
+        if name is not None:
+            out[str(name)] = item
+    return out
 
 
 def _finding_key(finding: Any) -> tuple[str, str]:
diff --git a/_vdiff/tests/test_diff.py b/_vdiff/tests/test_diff.py
@@ -210,3 +210,26 @@ class FakeReport:
 
         delta = diff_reports(FakeReport(), FakeReport())
         assert delta.before_count == 1
+
+
+def test_diff_handles_real_list_shaped_per_pattern():
+    """Regression: real DiagnoseReport JSON carries per_pattern as a LIST of
+    dicts (not a name-keyed dict). diff_reports must not crash on it."""
+    from vstack.vdiff import diff_reports
+
+    before = {
+        "shape": "individual",
+        "findings": [],
+        "per_pattern": [{"pattern": "lewin", "finding_count": 0, "error": None}],
+    }
+    after = {
+        "shape": "individual",
+        "findings": [{"pattern": "aar", "severity": "high", "title": "new"}],
+        "per_pattern": [
+            {"pattern": "lewin", "finding_count": 0, "error": None},
+            {"pattern": "aar", "finding_count": 1, "error": None},
+        ],
+    }
+    delta = diff_reports(before, after)
+    assert "aar" in delta.patterns_added
+    assert [d.title for d in delta.added] == ["new"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "valanistack"
-version = "0.48.0"
+version = "0.49.0"
 description = "Organizational behavior, practiced on AI agents."
 readme = "README.md"
 requires-python = ">=3.11"