Validate committed base analysis SHA

brovatten · brovatten · commit 78c137851ff9 · 2026-06-10T12:12:57.000+02:00
diff --git a/action.yml b/action.yml
@@ -352,15 +352,24 @@ runs:
       working-directory: target-repo
       env:
         BASE_SHA: ${{ steps.guard.outputs.base_sha }}
+        ACTION_PATH: ${{ github.action_path }}
       run: |
         BASE_DIR="${RUNNER_TEMP}/cb-base"
         HEAD_DIR="${RUNNER_TEMP}/cb-head"
         mkdir -p "$BASE_DIR" "$HEAD_DIR"
         echo "base_dir=$BASE_DIR" >> $GITHUB_OUTPUT
         echo "head_dir=$HEAD_DIR" >> $GITHUB_OUTPUT
         if git show "${BASE_SHA}:.codeboarding/analysis.json" > "${BASE_DIR}/analysis.json" 2>/dev/null; then
-          echo "committed=true" >> $GITHUB_OUTPUT
-          echo "Using committed .codeboarding/analysis.json at ${BASE_SHA}."
+          if python3 "$ACTION_PATH/scripts/cb_engine.py" validate-base \
+              --analysis "${BASE_DIR}/analysis.json" \
+              --expected-sha "$BASE_SHA"; then
+            echo "committed=true" >> $GITHUB_OUTPUT
+            echo "Using committed .codeboarding/analysis.json at ${BASE_SHA}."
+          else
+            rm -f "${BASE_DIR}/analysis.json"
+            echo "committed=false" >> $GITHUB_OUTPUT
+            echo "Committed baseline at ${BASE_SHA} is stale; will generate a fresh base analysis."
+          fi
         else
           rm -f "${BASE_DIR}/analysis.json"
           echo "committed=false" >> $GITHUB_OUTPUT
@@ -373,7 +382,7 @@ runs:
       uses: actions/cache/restore@v4
       with:
         path: ${{ runner.temp }}/cb-base
-        key: cb-base-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
+        key: cb-base-v2-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
 
     # A committed analysis.json gives the head analysis stable component ids,
     # but the engine's incremental path ALSO needs the base static_analysis.pkl
@@ -478,7 +487,7 @@ runs:
       uses: actions/cache/save@v4
       with:
         path: ${{ runner.temp }}/cb-base
-        key: cb-base-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
+        key: cb-base-v2-${{ runner.os }}-${{ steps.guard.outputs.base_sha }}-d${{ inputs.depth_level }}-${{ inputs.engine_ref }}-${{ inputs.llm_provider }}-${{ inputs.agent_model }}-${{ inputs.parsing_model }}
 
     - name: Analyze PR head (incremental from base)
       if: steps.guard.outputs.skip != 'true'
diff --git a/docs/COMMIT_STRATEGY.md b/docs/COMMIT_STRATEGY.md
@@ -43,6 +43,7 @@ The engine writes these under `.codeboarding/`:
 The warm-start — and the engine's incremental path itself — needs the pkl **and** its `.sha`: the cluster baseline that drives incremental lives only inside the pkl, so a committed `analysis.json` alone forces the head run into a full (LLM) fallback. The review action therefore guarantees the pair exists for the base SHA:
 
 - **No committed baseline:** the generated base analysis writes the pkl as a side effect; the artifact dir is saved in `actions/cache` keyed by base SHA / depth / engine ref.
+- **Committed baseline:** the action first requires the `metadata.commit_hash` field of the committed `analysis.json` to equal the PR base SHA. A committed diagram that is stale — or whose metadata is missing or unreadable — is treated like no baseline, so the base is regenerated at the PR base commit before diffing.
 - **Committed baseline, cache miss:** the action *seeds* the pkl deterministically (`cb_engine.py seed`: LSP indexing + the same clustering call a full run makes — **no LLM calls**), then saves it to the same cache. Seeding is fail-open: if it fails, the head run falls back to a full analysis.
 
 Either way the head analysis is seeded from that directory and runs incrementally. This keeps the repo clean — the pkl never enters git — while the cache + seeding make incremental work from the first PR run.
diff --git a/scripts/cb_engine.py b/scripts/cb_engine.py
@@ -42,6 +42,40 @@ def _clear_dir(path: Path) -> None:
             child.unlink()
 
 
+def validate_base_analysis(analysis_path: Path, expected_sha: str) -> tuple[bool, str]:
+    """Return whether ``analysis.json`` was generated for ``expected_sha``.
+
+    A committed baseline is reusable only when its ``metadata.commit_hash``
+    equals the PR base commit; otherwise the diff would be computed from the
+    PR base while mutating an older diagram snapshot. Returns ``(ok, message)``;
+    a missing, unreadable, non-UTF-8 or malformed file yields ``ok=False``.
+    """
+    try:
+        data = json.loads(analysis_path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        return False, f"Baseline analysis is missing: {analysis_path}"
+    except (OSError, ValueError) as exc:  # ValueError: bad JSON or non-UTF-8 bytes
+        return False, f"Baseline analysis is unreadable: {exc}"
+
+    if not isinstance(data, dict):
+        return False, "Baseline analysis root is not a JSON object."
+
+    metadata = data.get("metadata")
+    if not isinstance(metadata, dict):
+        return False, "Baseline analysis metadata is missing."
+
+    actual_sha = metadata.get("commit_hash")
+    if not isinstance(actual_sha, str) or not actual_sha:
+        return False, "Baseline analysis metadata.commit_hash is missing."
+
+    if actual_sha != expected_sha:
+        return (
+            False,
+            f"Baseline analysis was generated for {actual_sha}, expected PR base {expected_sha}.",
+        )
+    return True, f"Baseline analysis commit matches PR base {expected_sha}."
+
+
 def run_base(repo: str, out: str, name: str, run_id: str, depth: int, source_sha: str) -> None:
     from codeboarding_workflows.analysis import run_full
 
@@ -199,6 +233,10 @@ def main(argv=None) -> int:
     for a in ("--artifact-dir", "--repo", "--name", "--issues-out"):
         hc.add_argument(a, required=True)
 
+    vb = sub.add_parser("validate-base")
+    vb.add_argument("--analysis", required=True)
+    vb.add_argument("--expected-sha", required=True)
+
     args = p.parse_args(argv)
     if args.cmd == "base":
         run_base(args.repo, args.out, args.name, args.run_id, args.depth, args.source_sha)
@@ -208,6 +246,10 @@ def main(argv=None) -> int:
         run_head(args.repo, args.out, args.name, args.run_id, args.depth, args.base_ref, args.target_ref, args.source_sha)
     elif args.cmd == "health":
         Path(args.issues_out).write_text(str(run_health(args.artifact_dir, args.repo, args.name)))
+    elif args.cmd == "validate-base":
+        ok, message = validate_base_analysis(Path(args.analysis), args.expected_sha)
+        print(message)
+        return 0 if ok else 1
     return 0
 
 
diff --git a/scripts/run_local.sh b/scripts/run_local.sh
@@ -84,11 +84,14 @@ else
   [ -d "$ENGINE" ] || { echo "Engine not found at $ENGINE (set --engine or \$ENGINE)." >&2; exit 2; }
   [ -n "${OPENROUTER_API_KEY:-}" ] || { echo "Export OPENROUTER_API_KEY for the full pipeline." >&2; exit 2; }
   REPO="$(cd "$REPO" && pwd)"
+  BASE_SHA="$(git -C "$REPO" rev-parse "$BASE_REF^{commit}")"
+  HEAD_SHA="$(git -C "$REPO" rev-parse "$HEAD_REF^{commit}")"
   BASE_DIR="$OUT/base"; HEAD_DIR="$OUT/head"
   rm -rf "$BASE_DIR" "$HEAD_DIR"; mkdir -p "$BASE_DIR" "$HEAD_DIR"
 
-  echo "== Resolving base analysis at $BASE_REF =="
-  if git -C "$REPO" show "$BASE_REF:.codeboarding/analysis.json" > "$BASE_DIR/analysis.json" 2>/dev/null; then
+  echo "== Resolving base analysis at $BASE_SHA =="
+  if git -C "$REPO" show "$BASE_SHA:.codeboarding/analysis.json" > "$BASE_DIR/analysis.json" 2>/dev/null \
+     && run_engine validate-base --analysis "$BASE_DIR/analysis.json" --expected-sha "$BASE_SHA"; then
     echo "  using committed baseline"
     # Mirror action.yml: a committed analysis.json alone can't drive incremental —
     # the engine needs the base static_analysis.pkl with its cluster baseline.
@@ -97,8 +100,8 @@ else
     git -C "$REPO" worktree remove --force "$BASE_SRC" 2>/dev/null || true
     git -C "$REPO" worktree prune
     rm -rf "$BASE_SRC"
-    git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_REF" >/dev/null
-    if run_engine seed --repo "$BASE_SRC" --out "$BASE_DIR" --source-sha "$BASE_REF"; then
+    git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_SHA" >/dev/null
+    if run_engine seed --repo "$BASE_SRC" --out "$BASE_DIR" --source-sha "$BASE_SHA"; then
       echo "  seeded static-analysis baseline (no LLM)"
     else
       rm -f "$BASE_DIR/static_analysis.pkl" "$BASE_DIR/static_analysis.sha"
@@ -112,29 +115,29 @@ else
     git -C "$REPO" worktree remove --force "$BASE_SRC" 2>/dev/null || true
     git -C "$REPO" worktree prune
     rm -rf "$BASE_SRC"
-    git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_REF" >/dev/null
+    git -C "$REPO" worktree add --detach "$BASE_SRC" "$BASE_SHA" >/dev/null
     run_engine base \
       --repo "$BASE_SRC" \
       --out "$BASE_DIR" \
       --name "$(basename "$REPO")" \
       --run-id local-base \
       --depth "$DEPTH" \
-      --source-sha "$BASE_REF"
+      --source-sha "$BASE_SHA"
     git -C "$REPO" worktree remove --force "$BASE_SRC" >/dev/null 2>&1 || true
     [ -f "$BASE_DIR/analysis.json" ] || { echo "Base full analysis ran but analysis.json is missing." >&2; exit 1; }
   fi
 
-  echo "== Analyzing head at $HEAD_REF (incremental from base) =="
+  echo "== Analyzing head at $HEAD_SHA (incremental from base) =="
   cp -a "$BASE_DIR"/. "$HEAD_DIR"/ 2>/dev/null || true
   run_engine head \
     --repo "$REPO" \
     --out "$HEAD_DIR" \
     --name "$(basename "$REPO")" \
     --run-id local-head \
     --depth "$DEPTH" \
-    --base-ref "$BASE_REF" \
-    --target-ref "$HEAD_REF" \
-    --source-sha "$HEAD_REF"
+    --base-ref "$BASE_SHA" \
+    --target-ref "$HEAD_SHA" \
+    --source-sha "$HEAD_SHA"
   [ -f "$HEAD_DIR/analysis.json" ] || { echo "Head analysis ran but analysis.json is missing." >&2; exit 1; }
   BASE_ANALYSIS="$BASE_DIR/analysis.json"
   HEAD_ANALYSIS="$HEAD_DIR/analysis.json"
diff --git a/tests/test_cb_engine.py b/tests/test_cb_engine.py
@@ -1,6 +1,7 @@
 """Smoke tests for scripts/cb_engine.py — verify it calls the engine API correctly,
 using stub modules so no real engine venv is needed."""
 
+import json
 import os
 import sys
 import tempfile
@@ -158,6 +159,53 @@ def test_head_falls_back_to_full_on_baseline_unavailable(self):
         self.assertEqual(len(rf.calls), 1)  # BaselineUnavailableError also triggers the full re-run
 
 
+class TestValidateBase(_Base):
+    """validate_base_analysis and the ``validate-base`` CLI gate on metadata.commit_hash."""
+
+    @staticmethod
+    def _write_analysis(directory: str, payload: dict) -> Path:
+        """Serialize ``payload`` as ``analysis.json`` inside ``directory``."""
+        target = Path(directory) / "analysis.json"
+        target.write_text(json.dumps(payload), encoding="utf-8")
+        return target
+
+    def test_validate_base_accepts_matching_commit(self):
+        with tempfile.TemporaryDirectory() as workdir:
+            analysis = self._write_analysis(workdir, {"metadata": {"commit_hash": "abc123"}})
+            accepted, reason = cb_engine.validate_base_analysis(analysis, "abc123")
+
+        self.assertTrue(accepted)
+        self.assertIn("matches", reason)
+
+    def test_validate_base_rejects_mismatched_commit(self):
+        with tempfile.TemporaryDirectory() as workdir:
+            analysis = self._write_analysis(workdir, {"metadata": {"commit_hash": "old"}})
+            accepted, reason = cb_engine.validate_base_analysis(analysis, "new")
+
+        # The message must name both sides of the mismatch.
+        self.assertFalse(accepted)
+        for sha in ("old", "new"):
+            self.assertIn(sha, reason)
+
+    def test_validate_base_rejects_missing_commit(self):
+        with tempfile.TemporaryDirectory() as workdir:
+            # An empty metadata object carries no commit_hash at all.
+            analysis = self._write_analysis(workdir, {"metadata": {}})
+            accepted, reason = cb_engine.validate_base_analysis(analysis, "abc123")
+
+        self.assertFalse(accepted)
+        self.assertIn("commit_hash", reason)
+
+    def test_main_validate_base_exit_codes(self):
+        with tempfile.TemporaryDirectory() as workdir:
+            analysis = self._write_analysis(workdir, {"metadata": {"commit_hash": "abc123"}})
+            base_argv = ["validate-base", "--analysis", str(analysis), "--expected-sha"]
+
+            # Exit status 0 on a matching SHA, 1 on a mismatch.
+            for expected_sha, exit_code in (("abc123", 0), ("def456", 1)):
+                self.assertEqual(cb_engine.main([*base_argv, expected_sha]), exit_code)
+
+
 class TestSeed(_Base):
     """run_seed must analyze, cluster, then save — in that order, same results object.