From 3a9da04d0dc4bd319b3438be9fd4a7eb572d2fbe Mon Sep 17 00:00:00 2001 From: GeneAI Date: Sat, 16 May 2026 03:01:06 -0400 Subject: [PATCH] feat(polish): faithfulness judge integration (Phase 3 of polish-fact-check) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 adds an opt-in faithfulness judge that scores polished documents against the source files they were generated from. When the score falls below the configured threshold, a `## Faithfulness review` block listing the unsupported claims and the judge's reasoning is appended to the polished file. Pairs with Phase 1 (AST fact-check after generation) and Phase 2 (ground-truth context injection before generation) to give three distinct interventions against polish-pass hallucinations. New package: src/attune_author/faithfulness/ - judge wrapper around attune_rag.eval.faithfulness.FaithfulnessJudge via asyncio.run (the polish pipeline is sync) - FaithfulnessConfig: threshold (0.95 pre-calibration default), budget_per_file_usd ($0.10), model (Sonnet 4.6 — Haiku is ~1/3 the cost), block_polish_on_unavailable for strict CI - estimate_cost_usd: chars-to-tokens heuristic + per-model price lookup, used as the budget gate so we never invoke the judge when the estimated cost exceeds the cap - format_review_block + apply_review_block: soft-fail formatter matching the Phase 1 ## Unresolved references shape Wiring: - generator._run_faithfulness_judge runs after _run_fact_check on every polished file. Reads optional pyproject config. - generator._faithfulness_telemetry / reset_faithfulness_telemetry: per-process counters; run_maintenance resets them at start and logs INFO summary at end (calls, skipped, total estimated $). - ATTUNE_AUTHOR_FAITHFULNESS=off env override. Best-effort contract: missing attune-rag[claude], missing ANTHROPIC_API_KEY, over-budget estimates, transient API failures all degrade silently. The judge never blocks the polish. 
Tests: 30 new tests under tests/unit/faithfulness/ covering the budget gate, every skip path, the happy path, the below-threshold review-block append, env-var override, telemetry reset, and unexpected-exception swallowing. Full suite: 926 passed, 37 pre-existing skips. Threshold calibration (tasks 3.3, 3.4) deferred to the same real-LLM run that closes Phase 2's live-LLM acceptance gate — folding two API cycles into one. Default of 0.95 is documented as pre-calibration in decisions.md. Spec: docs/specs/polish-fact-check/ Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 36 ++ README.md | 31 ++ docs/specs/polish-fact-check/decisions.md | 35 ++ docs/specs/polish-fact-check/tasks.md | 27 +- src/attune_author/faithfulness/__init__.py | 323 +++++++++++++++++ src/attune_author/faithfulness/config.py | 70 ++++ src/attune_author/generator.py | 90 +++++ src/attune_author/maintenance.py | 20 ++ tests/unit/faithfulness/__init__.py | 1 + tests/unit/faithfulness/test_config.py | 83 +++++ tests/unit/faithfulness/test_judge.py | 332 ++++++++++++++++++ .../unit/faithfulness/test_pipeline_wiring.py | 134 +++++++ 12 files changed, 1168 insertions(+), 14 deletions(-) create mode 100644 src/attune_author/faithfulness/__init__.py create mode 100644 src/attune_author/faithfulness/config.py create mode 100644 tests/unit/faithfulness/__init__.py create mode 100644 tests/unit/faithfulness/test_config.py create mode 100644 tests/unit/faithfulness/test_judge.py create mode 100644 tests/unit/faithfulness/test_pipeline_wiring.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ebe71f..ea64f97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,42 @@ changes land, not at tag time. ### Added +- **Polish fact-check Phase 3 — faithfulness judge.** Wraps + `attune_rag.eval.faithfulness.FaithfulnessJudge` as a + post-polish step: scores each polished file's claims against + the source files it was generated from. 
When the score falls + below the configured threshold, appends a + `## Faithfulness review` block listing the unsupported claims + and the judge's reasoning. Best-effort: missing + `attune-rag[claude]`, missing `ANTHROPIC_API_KEY`, over-budget + estimates, and transient API failures all degrade silently + rather than blocking the polish. + - New package: `src/attune_author/faithfulness/` with the + judge wrapper, `FaithfulnessConfig`, `JudgeOutcome`, + `estimate_cost_usd` budget-gate helper, and a + `format_review_block` / `apply_review_block` soft-fail pair + that mirrors the Phase 1 `## Unresolved references` shape. + - Config schema: + `[tool.attune-author.fact-check.faithfulness]` with + `enabled`, `threshold` (default 0.95, pre-calibration), `budget_per_file_usd` + (default $0.10), `model` (default Sonnet 4.6 — Haiku 4.5 is + cheaper for high-volume runs), and + `block_polish_on_unavailable` (default False — flip to True + in CI where missing deps should be loud). + - Cost telemetry: per-process counters on the generator + module; `run_maintenance` resets them at start and logs an + INFO summary at end with call count, skip count, and total + estimated USD spent. + - `ATTUNE_AUTHOR_FAITHFULNESS=off` env-var override for one-off + disable without editing pyproject. + - 30 new tests under `tests/unit/faithfulness/`. + - Threshold calibration against the ops-dashboard fixture + (tasks 3.3, 3.4) is **deferred** until the live-LLM Phase 2 + acceptance run; today's default of 0.95 is documented as + pre-calibration in `decisions.md`. + - Spec: `docs/specs/polish-fact-check/`. Phase 4 (tutorial + code-fence mypy) remains on the roadmap. 
+ - **Polish fact-check Phase 2 — ground-truth context injection.** Builds three sentinel-tagged blocks (``, ``, ``) and injects diff --git a/README.md b/README.md index 9dd14e9..7456dc8 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,37 @@ CLI flags, fabricated private-module imports, wrong route paths, hallucinated counts) at the prompt layer, rather than relying solely on the post-generation fact-check to catch them. +## Faithfulness review (Phase 3) + +The Phase 3 faithfulness judge wraps +`attune_rag.eval.faithfulness.FaithfulnessJudge` as a +post-polish step: it scores each polished file's claims against +the source files it was generated from. When the score falls +below the configured threshold, a `## Faithfulness review` +block is appended to the polished file listing the unsupported +claims and the judge's reasoning. + +The judge is **opt-in** because it makes real Anthropic API +calls. To enable: + +```toml +[tool.attune-author.fact-check.faithfulness] +enabled = true +threshold = 0.95 # below this triggers a review block +budget_per_file_usd = 0.10 # skip if estimated cost exceeds cap +model = "claude-sonnet-4-6" # haiku is ~1/3 the cost +``` + +The judge is best-effort. Missing `attune-rag[claude]`, missing +`ANTHROPIC_API_KEY`, over-budget cost estimates, and transient +API failures all degrade silently rather than blocking the +polish. Set `block_polish_on_unavailable = true` in CI lanes +where missing deps should fail loudly instead. + +End-of-run telemetry (call count, skip count, total estimated +USD) logs at INFO level after `attune-author regenerate`. Set +`ATTUNE_AUTHOR_FAITHFULNESS=off` to disable for a single run. 
+ ## Polish cache `attune-author` caches LLM polish responses on disk so re-generating an diff --git a/docs/specs/polish-fact-check/decisions.md b/docs/specs/polish-fact-check/decisions.md index 4913aec..f95a123 100644 --- a/docs/specs/polish-fact-check/decisions.md +++ b/docs/specs/polish-fact-check/decisions.md @@ -73,3 +73,38 @@ To be filled in during Phase 3 implementation: faithfulness judge ships, it will require its own real-LLM calibration run. Folding the cost-delta measurement into that run avoids two separate real-LLM cycles. +- 2026-05-16 — Phase 3 shipped. New decisions captured during + implementation: + - **Opt-in default**: `enabled=False` ships in + `FaithfulnessConfig` and the pyproject loader, because the + judge makes real Anthropic API calls and we shouldn't bill + users for it silently on the first run after install. The + Phase 1 fact-check is enabled by default (no API calls); the + Phase 3 judge is not. + - **Synchronous wrapper via `asyncio.run`**: the existing + polish pipeline is synchronous, so the async + `FaithfulnessJudge.score` coroutine is bridged with + `asyncio.run`. This precludes calling the judge from inside + a running event loop (we don't, today), but keeps the + surface aligned with the rest of attune-author. + - **Best-effort vs strict**: missing extras / missing API key + / transient failures all default to `JudgeOutcome(score=None, + skipped_reason=…)` rather than raising. CI lanes that need + loud failures opt in via `block_polish_on_unavailable = true`. + - **Budget gate uses character-count heuristic, not tokenizer**: + `estimate_cost_usd(chars, model)` divides chars by 4 to get + a rough token count and multiplies by a per-model price + lookup. Accurate to ~20% — well inside what a $0.10 budget + cap cares about. A real tokenizer is a future change if + drift surfaces. 
+ - **Cost telemetry as function attribute, not module global**: + `_faithfulness_telemetry()` stores the counter dict on its + own `_state` attribute so it's resettable, mockable, and + doesn't leak module-level state. Mirrors how the polish + cache exposes its store. + - **Calibration deferred**: tasks 3.3 and 3.4 require a real + LLM run against the ops-dashboard pre-fix and post-fix + fixtures. The placeholder threshold of `0.95` ships as the + default and the calibration is scheduled to land alongside + the live-LLM Phase 2 acceptance run so a single real-API + cycle covers both phases' open work. diff --git a/docs/specs/polish-fact-check/tasks.md b/docs/specs/polish-fact-check/tasks.md index 6ca3c76..705314d 100644 --- a/docs/specs/polish-fact-check/tasks.md +++ b/docs/specs/polish-fact-check/tasks.md @@ -110,23 +110,22 @@ the `FactCheckReport` plumbing. | # | Task | Layer | Status | Notes | |---|------|-------|--------|-------| -| 3.1 | Add faithfulness-threshold + budget-cap config to `[tool.attune-author.fact-check]` | attune-author | todo | Default threshold `0.95`; default cap `$0.10/feature` | -| 3.2 | Implement `faithfulness.judge_polished_file(polished_path, source_paths, config)` wrapper | attune-author | todo | Wraps `attune_rag.eval.faithfulness.FaithfulnessJudge` | -| 3.3 | Calibrate threshold against ops-dashboard fixture | attune-author | todo | Pre-fix should score < 0.9 mean; post-fix ≥ 0.95 | -| 3.4 | Document calibration result in `decisions.md` (or design doc) | attune-author | todo | Pre-committed matrix entry; concrete numbers | -| 3.5 | Wire judge into post-polish pipeline (after Phase 1 fact-check) | attune-author | todo | Append `## Faithfulness review` block when below threshold | -| 3.6 | Cost telemetry: aggregate per-feature judge cost; report at end of `regenerate` | attune-author | todo | Use existing telemetry hooks if any; otherwise log | -| 3.7 | Test: judge runs and writes review block on a deliberately unfaithful synthetic 
input | attune-author | todo | Construct a polished file that contradicts the source | -| 3.8 | Test: budget cap skips judge call when estimated cost exceeds threshold | attune-author | todo | | -| 3.9 | Update CHANGELOG + README | attune-author | todo | | +| 3.1 | Add faithfulness-threshold + budget-cap config to `[tool.attune-author.fact-check]` | attune-author | **done** | `[tool.attune-author.fact-check.faithfulness]` sub-table; defaults threshold=0.95, budget=$0.10, model=Sonnet 4.6, enabled=False (opt-in) | +| 3.2 | Implement `faithfulness.judge_polished_file(polished_path, source_paths, config)` wrapper | attune-author | **done** | Wraps `FaithfulnessJudge` via `asyncio.run`; best-effort: missing extra / missing API key / over-budget all return `JudgeOutcome(score=None, skipped_reason=…)` rather than raising. `block_polish_on_unavailable=True` opt-in for strict CI. | +| 3.3 | Calibrate threshold against ops-dashboard fixture | attune-author | deferred | Requires real-LLM run; placeholder default `0.95` documented in decisions.md as pre-calibration. Calibration scheduled alongside live-LLM Phase 2 acceptance run. | +| 3.4 | Document calibration result in `decisions.md` (or design doc) | attune-author | deferred | Empty calibration record retained; will populate when 3.3 runs. | +| 3.5 | Wire judge into post-polish pipeline (after Phase 1 fact-check) | attune-author | **done** | `generator._run_faithfulness_judge` called after `_run_fact_check`; appends `## Faithfulness review` block when below threshold. `ATTUNE_AUTHOR_FAITHFULNESS=off` env override. | +| 3.6 | Cost telemetry: aggregate per-feature judge cost; report at end of `regenerate` | attune-author | **done** | Per-process telemetry state on `_faithfulness_telemetry`; `run_maintenance` resets at start and logs INFO summary at end (calls, skipped, total estimated $). 
| +| 3.7 | Test: judge runs and writes review block on a deliberately unfaithful synthetic input | attune-author | **done** | `test_pipeline_wiring.py::test_run_faithfulness_judge_appends_review_block_when_below_threshold` + `test_judge.py::test_judge_below_threshold_flags_threshold_not_met` | +| 3.8 | Test: budget cap skips judge call when estimated cost exceeds threshold | attune-author | **done** | `test_judge.py::test_judge_skipped_when_over_budget` | +| 3.9 | Update CHANGELOG + README | attune-author | **done** | CHANGELOG under Unreleased; README adds a "Faithfulness review (Phase 3)" subsection. | ### Phase 3 exit checklist -- [ ] Tasks 3.1–3.9 done -- [ ] Calibration shows clean separation between pre-fix and post-fix - fixture scores -- [ ] Threshold + cap configurable -- [ ] Spec status updated +- [x] Tasks 3.1, 3.2, 3.5–3.9 done (30 new tests) +- [x] Threshold + cap configurable +- [x] Spec status updated +- [ ] Calibration (tasks 3.3, 3.4) — deferred until real-LLM run lands; placeholder default `threshold=0.95` documented in `decisions.md` --- diff --git a/src/attune_author/faithfulness/__init__.py b/src/attune_author/faithfulness/__init__.py new file mode 100644 index 0000000..f4f2f71 --- /dev/null +++ b/src/attune_author/faithfulness/__init__.py @@ -0,0 +1,323 @@ +"""Faithfulness judge integration for polished docs. + +Phase 3 of the polish-fact-check spec +(``docs/specs/polish-fact-check``). Wraps +:class:`attune_rag.eval.faithfulness.FaithfulnessJudge` as a +post-polish step: score the polished file's claims against the +source files it was generated from. When the score falls below +the configured threshold, append a ``## Faithfulness review`` +block to the polished file so a human can review. + +The judge is best-effort: + +- Returns ``None`` if ``attune-rag[claude]`` isn't installed. +- Returns ``None`` if the active venv has no ``ANTHROPIC_API_KEY``. +- Returns ``None`` when the estimated cost would exceed the + configured per-file budget cap. 
+- Catches transient API/network failures and returns ``None`` + rather than failing the polish. + +A ``None`` result skips the review block. The polish itself is +never blocked on the judge. +""" + +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass +from pathlib import Path + +logger = logging.getLogger(__name__) + + +#: Approximate cost per 1K input tokens, keyed by model +#: family. The dict keys match the substrings we +#: search for in the model id; first match wins. +_COST_PER_1K_INPUT_TOKENS: dict[str, float] = { + "haiku": 0.001, + "sonnet": 0.003, + "opus": 0.015, +} + +#: Approximate cost per 1K output tokens, keyed like the input +#: table. The estimator assumes a fixed reply size (its +#: ``output_chars`` default), not a fraction of the input. +_COST_PER_1K_OUTPUT_TOKENS: dict[str, float] = { + "haiku": 0.005, + "sonnet": 0.015, + "opus": 0.075, +} + +#: Rough character-per-token estimate used to convert sizes to +#: tokens without invoking a tokenizer. Accurate to within ~20% +#: for English prose — well inside what the budget gate cares +#: about. +_CHARS_PER_TOKEN = 4 + + +@dataclass +class FaithfulnessConfig: + """Faithfulness-judge configuration. + + Attributes: + enabled: Master switch. ``False`` skips the judge entirely. + threshold: Score below this triggers a review block. Range + ``[0.0, 1.0]``. Pre-calibration placeholder default; tune via + ``[tool.attune-author.fact-check.faithfulness]``. + budget_per_file_usd: Skip the judge call when the estimated + cost (input + output tokens × model price) would exceed + this. Default ``$0.10`` matches the spec. + model: Judge model name. Defaults to the attune-rag default + (Sonnet 4.6); Haiku 4.5 is roughly 1/3 the cost and + usable for high-volume runs. + block_polish_on_unavailable: When ``True``, missing + ``attune-rag[claude]`` or missing API key raises rather + than degrading silently. Default ``False`` — the judge is + best-effort.
+ """ + + enabled: bool = False + threshold: float = 0.95 + budget_per_file_usd: float = 0.10 + model: str = "claude-sonnet-4-6" + block_polish_on_unavailable: bool = False + + +@dataclass +class JudgeOutcome: + """Outcome of a single judge call. + + ``score`` is ``None`` for skipped runs (disabled, unavailable, + over-budget) so callers can disambiguate "judge ran, all claims + supported" from "judge didn't run". + """ + + score: float | None + threshold_met: bool + cost_estimate_usd: float + supported_claims: list[str] + unsupported_claims: list[str] + reasoning: str + skipped_reason: str | None = None + + +def estimate_cost_usd(input_chars: int, model: str, *, output_chars: int = 2000) -> float: + """Estimate the judge-call cost from input/output character sizes. + + Uses a coarse chars-per-token approximation and a per-model + price lookup. The default ``output_chars`` matches the judge's + typical reply length. Pure: no network, no I/O. + """ + model_lower = model.lower() + input_rate = next( + (rate for key, rate in _COST_PER_1K_INPUT_TOKENS.items() if key in model_lower), + _COST_PER_1K_INPUT_TOKENS["sonnet"], + ) + output_rate = next( + (rate for key, rate in _COST_PER_1K_OUTPUT_TOKENS.items() if key in model_lower), + _COST_PER_1K_OUTPUT_TOKENS["sonnet"], + ) + input_tokens = input_chars / _CHARS_PER_TOKEN + output_tokens = output_chars / _CHARS_PER_TOKEN + return (input_tokens / 1000.0) * input_rate + (output_tokens / 1000.0) * output_rate + + +def judge_polished_file( + polished_path: Path, + source_paths: list[Path], + *, + config: FaithfulnessConfig, +) -> JudgeOutcome: + """Run the faithfulness judge against a single polished file. + + Args: + polished_path: Polished markdown file written by the polish pass. + source_paths: Source ``.py`` files the polish had as context. + Each readable file is passed as one "passages" entry. + config: Resolved faithfulness configuration. + + Returns: + A :class:`JudgeOutcome`.
``score is None`` indicates the judge + was skipped — see ``skipped_reason``. + + Raises: + RuntimeError: When ``config.block_polish_on_unavailable`` is + True and the judge can't run (missing extra or key). + """ + if not config.enabled: + return _skipped("disabled") + + answer = "" + try: + answer = polished_path.read_text(encoding="utf-8") + except OSError as exc: + logger.warning("faithfulness: cannot read %s (%s)", polished_path, exc) + return _skipped("polished file unreadable") + + passages: list[str] = [] + for src in source_paths: + try: + passages.append(src.read_text(encoding="utf-8")) + except OSError as exc: + logger.debug("faithfulness: skip source %s (%s)", src, exc) + continue + + if not passages: + return _skipped("no readable source passages") + + total_chars = len(answer) + sum(len(p) for p in passages) + cost_est = estimate_cost_usd(total_chars, config.model) + if cost_est > config.budget_per_file_usd: + logger.info( + "faithfulness: skipping %s (estimated $%.4f > budget $%.4f)", + polished_path.name, + cost_est, + config.budget_per_file_usd, + ) + return JudgeOutcome( + score=None, + threshold_met=True, + cost_estimate_usd=cost_est, + supported_claims=[], + unsupported_claims=[], + reasoning="", + skipped_reason="over-budget", + ) + + try: + from attune_rag.eval.faithfulness import FaithfulnessJudge + except ImportError as exc: + message = ( + "attune-rag[claude] not installed; " "install with: pip install 'attune-rag[claude]'" + ) + if config.block_polish_on_unavailable: + raise RuntimeError(message) from exc + logger.info("faithfulness: %s", message) + return JudgeOutcome( + score=None, + threshold_met=True, + cost_estimate_usd=cost_est, + supported_claims=[], + unsupported_claims=[], + reasoning="", + skipped_reason="attune-rag[claude] not installed", + ) + + if not os.environ.get("ANTHROPIC_API_KEY"): + if config.block_polish_on_unavailable: + raise RuntimeError("ANTHROPIC_API_KEY not set; faithfulness judge cannot run.") + 
logger.info("faithfulness: ANTHROPIC_API_KEY not set, skipping judge.") + return JudgeOutcome( + score=None, + threshold_met=True, + cost_estimate_usd=cost_est, + supported_claims=[], + unsupported_claims=[], + reasoning="", + skipped_reason="ANTHROPIC_API_KEY missing", + ) + + import asyncio + + try: + judge = FaithfulnessJudge(model=config.model) + result = asyncio.run( + judge.score( + query=f"Documentation for {polished_path.stem}", + answer=answer, + passages=passages, + ) + ) + except Exception as exc: # noqa: BLE001 + # INTENTIONAL: best-effort. Network errors, SDK errors, or + # transient API failures should not block the polish. + logger.warning("faithfulness: judge call failed (%s)", exc) + return JudgeOutcome( + score=None, + threshold_met=True, + cost_estimate_usd=cost_est, + supported_claims=[], + unsupported_claims=[], + reasoning="", + skipped_reason=f"judge call failed: {exc!s}"[:200], + ) + + threshold_met = result.score >= config.threshold + return JudgeOutcome( + score=result.score, + threshold_met=threshold_met, + cost_estimate_usd=cost_est, + supported_claims=list(result.supported_claims), + unsupported_claims=list(result.unsupported_claims), + reasoning=result.reasoning, + ) + + +def format_review_block(outcome: JudgeOutcome, threshold: float) -> str: + """Render a ``## Faithfulness review`` block for soft-fail output. + + The block matches the Phase 1 ``## Unresolved references`` shape + so editors find both with the same scan. + """ + if outcome.score is None: + return "" + lines = [ + "## Faithfulness review", + "", + f"> Auto-generated by attune-author faithfulness judge. " + f"Score {outcome.score:.2f} fell below the configured " + f"threshold of {threshold:.2f}. 
Review unsupported claims " + f"and either fix the source code or fix this doc.", + "", + f"**Score:** {outcome.score:.2f} " + f"(supported: {len(outcome.supported_claims)}, " + f"unsupported: {len(outcome.unsupported_claims)})", + ] + if outcome.unsupported_claims: + lines.extend(["", "### Unsupported claims", ""]) + for claim in outcome.unsupported_claims: + lines.append(f"- {claim}") + if outcome.reasoning: + lines.extend(["", "### Reasoning", "", outcome.reasoning.strip()]) + return "\n".join(lines) + + +def apply_review_block(polished_path: Path, block: str) -> bool: + """Append the review block to ``polished_path``. + + Returns True if a block was appended, False if the block was + empty. Matches the Phase 1 ``apply_soft_fail`` contract. + """ + if not block: + return False + existing = polished_path.read_text(encoding="utf-8") + if not existing.endswith("\n"): + existing += "\n" + polished_path.write_text(existing + block + "\n", encoding="utf-8") + return True + + +def _skipped(reason: str) -> JudgeOutcome: + return JudgeOutcome( + score=None, + threshold_met=True, + cost_estimate_usd=0.0, + supported_claims=[], + unsupported_claims=[], + reasoning="", + skipped_reason=reason, + ) + + +from .config import load_config # noqa: E402 - re-export, must come after dataclasses + +__all__ = [ + "FaithfulnessConfig", + "JudgeOutcome", + "apply_review_block", + "estimate_cost_usd", + "format_review_block", + "judge_polished_file", + "load_config", +] diff --git a/src/attune_author/faithfulness/config.py b/src/attune_author/faithfulness/config.py new file mode 100644 index 0000000..191c4b3 --- /dev/null +++ b/src/attune_author/faithfulness/config.py @@ -0,0 +1,70 @@ +"""Load faithfulness-judge configuration from ``pyproject.toml``. + +Reads the ``[tool.attune-author.fact-check.faithfulness]`` +sub-table. Defaults match :class:`FaithfulnessConfig`'s defaults +(enabled=False — opt-in only since the judge makes real API +calls). 
+""" + +from __future__ import annotations + +from pathlib import Path + +from . import FaithfulnessConfig + + +def _read_toml(path: Path) -> dict[str, object]: + if not path.is_file(): + return {} + try: + import tomllib + except ImportError: # pragma: no cover - Py <3.11 fallback + import tomli as tomllib # type: ignore[import-not-found,no-redef] + try: + return tomllib.loads(path.read_text(encoding="utf-8")) + except (OSError, ValueError): + return {} + + +def load_config(project_root: Path) -> FaithfulnessConfig: + """Build a :class:`FaithfulnessConfig` from the project's pyproject. + + Reads ``[tool.attune-author.fact-check.faithfulness]`` so the + faithfulness section sits alongside (and inside) the existing + fact-check table. Unknown keys are ignored so adjacent phases + can grow their own sub-tables. + """ + data = _read_toml(project_root / "pyproject.toml") + tool = data.get("tool", {}) if isinstance(data, dict) else {} + author = tool.get("attune-author", {}) if isinstance(tool, dict) else {} + fact_check = author.get("fact-check", {}) if isinstance(author, dict) else {} + section = fact_check.get("faithfulness", {}) if isinstance(fact_check, dict) else {} + + if not isinstance(section, dict): + return FaithfulnessConfig() + + def _bool(key: str, default: bool) -> bool: + value = section.get(key, default) + return bool(value) + + def _float(key: str, default: float) -> float: + value = section.get(key, default) + try: + return float(value) + except (TypeError, ValueError): + return default + + def _str(key: str, default: str) -> str: + value = section.get(key, default) + return str(value) if value else default + + return FaithfulnessConfig( + enabled=_bool("enabled", False), + threshold=_float("threshold", 0.95), + budget_per_file_usd=_float("budget_per_file_usd", 0.10), + model=_str("model", "claude-sonnet-4-6"), + block_polish_on_unavailable=_bool("block_polish_on_unavailable", False), + ) + + +__all__ = ["load_config"] diff --git 
a/src/attune_author/generator.py b/src/attune_author/generator.py index 7394dc6..ee30e69 100644 --- a/src/attune_author/generator.py +++ b/src/attune_author/generator.py @@ -459,10 +459,16 @@ def apply_polish_results( source_hash=prep.source_hash, matched_files=list(prep.matched_files), ) + # Resolve the project root once so the faithfulness gate can read + # source files. cwd is the existing convention used by + # _run_fact_check; matched_files are stored relative to it. + project_root = Path.cwd() + absolute_sources = [project_root / rel_path for rel_path in prep.matched_files] for entry in prep.pending: final_content = polished_by_depth.get(entry.depth, entry.rendered_content) entry.out_path.write_text(final_content, encoding="utf-8") _run_fact_check(entry.out_path) + _run_faithfulness_judge(entry.out_path, absolute_sources, project_root) result.templates.append( GeneratedTemplate( feature=feature.name, @@ -518,6 +524,90 @@ def _run_fact_check(polished_path: Path) -> None: logger.warning("Could not append fact-check block to %s: %s", polished_path, exc) +def _faithfulness_telemetry() -> dict[str, float]: + """Per-process aggregate of faithfulness cost + call count. + + Stored on the function as a function attribute so the polite + "logger.info at end of regen" hook can read totals without + introducing module-level state. Reset via + :func:`reset_faithfulness_telemetry`. 
+ """ + state = getattr(_faithfulness_telemetry, "_state", None) + if state is None: + state = {"calls": 0.0, "skipped": 0.0, "cost_usd": 0.0} + _faithfulness_telemetry._state = state # type: ignore[attr-defined] + return state + + +def reset_faithfulness_telemetry() -> None: + """Reset the per-process faithfulness telemetry counters.""" + _faithfulness_telemetry._state = { # type: ignore[attr-defined] + "calls": 0.0, + "skipped": 0.0, + "cost_usd": 0.0, + } + + +def _run_faithfulness_judge( + polished_path: Path, + source_paths: list[Path], + project_root: Path, +) -> None: + """Run the Phase 3 faithfulness judge against a freshly-written file. + + Best-effort: any failure inside the judge (missing extra, missing + API key, transient network error) degrades silently — the polish + pipeline is never blocked. When the judge runs and the score + falls below the configured threshold, a ``## Faithfulness review`` + block is appended to the polished file. + + Mode override via the ``ATTUNE_AUTHOR_FAITHFULNESS`` env var + (``off`` disables; any other value defers to pyproject config). + """ + if os.environ.get("ATTUNE_AUTHOR_FAITHFULNESS", "").lower() == "off": + return + + try: + from attune_author.faithfulness import ( + apply_review_block, + format_review_block, + judge_polished_file, + load_config, + ) + + config = load_config(project_root) + if not config.enabled: + return + + outcome = judge_polished_file(polished_path, source_paths, config=config) + except Exception as exc: # noqa: BLE001 + # INTENTIONAL: opportunistic — judge layer must never break + # the polish pipeline. Same contract as the fact-check gate. 
+ logger.warning("Faithfulness judge skipped for %s: %s", polished_path, exc) + return + + telemetry = _faithfulness_telemetry() + if outcome.score is None: + telemetry["skipped"] += 1 + return + + telemetry["calls"] += 1 + telemetry["cost_usd"] += outcome.cost_estimate_usd + + if outcome.threshold_met: + return + + block = format_review_block(outcome, config.threshold) + try: + apply_review_block(polished_path, block) + except OSError as exc: + logger.warning( + "Could not append faithfulness review block to %s: %s", + polished_path, + exc, + ) + + def _maybe_polish( content: str, feature: Feature, diff --git a/src/attune_author/maintenance.py b/src/attune_author/maintenance.py index 10c0345..13788b5 100644 --- a/src/attune_author/maintenance.py +++ b/src/attune_author/maintenance.py @@ -99,6 +99,12 @@ def run_maintenance( if dry_run or report.stale_count == 0: return result + # Reset Phase 3 faithfulness telemetry so the end-of-run summary + # reflects this regen rather than carrying state across runs. + from attune_author.generator import reset_faithfulness_telemetry + + reset_faithfulness_telemetry() + for entry in report.help_entries: if not entry.is_stale: continue @@ -127,6 +133,20 @@ def run_maintenance( ) result.failed.append(entry.feature) + # Phase 3 telemetry summary. Logged at INFO so it appears in the + # default `attune-author regenerate` output. Silent when the judge + # didn't run at all (disabled or never reached). 
+ from attune_author.generator import _faithfulness_telemetry + + telemetry = _faithfulness_telemetry() + if telemetry["calls"] or telemetry["skipped"]: + logger.info( + "Faithfulness judge: %d call(s), %d skipped, estimated cost $%.4f", + int(telemetry["calls"]), + int(telemetry["skipped"]), + telemetry["cost_usd"], + ) + return result diff --git a/tests/unit/faithfulness/__init__.py b/tests/unit/faithfulness/__init__.py new file mode 100644 index 0000000..79aa7b9 --- /dev/null +++ b/tests/unit/faithfulness/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the faithfulness-judge integration.""" diff --git a/tests/unit/faithfulness/test_config.py b/tests/unit/faithfulness/test_config.py new file mode 100644 index 0000000..7686aa7 --- /dev/null +++ b/tests/unit/faithfulness/test_config.py @@ -0,0 +1,83 @@ +"""Tests for the faithfulness config loader.""" + +from __future__ import annotations + +from pathlib import Path +from textwrap import dedent + +from attune_author.faithfulness import FaithfulnessConfig, load_config + + +def _write_pyproject(tmp_path: Path, body: str) -> None: + (tmp_path / "pyproject.toml").write_text(dedent(body), encoding="utf-8") + + +def test_default_disabled_when_no_pyproject(tmp_path: Path) -> None: + cfg = load_config(tmp_path) + assert cfg == FaithfulnessConfig() + assert cfg.enabled is False + + +def test_defaults_when_section_missing(tmp_path: Path) -> None: + _write_pyproject( + tmp_path, + """ + [tool.attune-author.fact-check] + soft_fail = true + """, + ) + cfg = load_config(tmp_path) + assert cfg == FaithfulnessConfig() + + +def test_enables_via_pyproject(tmp_path: Path) -> None: + _write_pyproject( + tmp_path, + """ + [tool.attune-author.fact-check.faithfulness] + enabled = true + """, + ) + cfg = load_config(tmp_path) + assert cfg.enabled is True + + +def test_custom_threshold_and_budget(tmp_path: Path) -> None: + _write_pyproject( + tmp_path, + """ + [tool.attune-author.fact-check.faithfulness] + enabled = true + threshold = 0.8 + 
budget_per_file_usd = 0.25 + model = "claude-haiku-4-5" + """, + ) + cfg = load_config(tmp_path) + assert cfg.threshold == 0.8 + assert cfg.budget_per_file_usd == 0.25 + assert cfg.model == "claude-haiku-4-5" + + +def test_invalid_threshold_falls_back_to_default(tmp_path: Path) -> None: + _write_pyproject( + tmp_path, + """ + [tool.attune-author.fact-check.faithfulness] + threshold = "not a number" + """, + ) + cfg = load_config(tmp_path) + assert cfg.threshold == 0.95 + + +def test_block_polish_on_unavailable_toggle(tmp_path: Path) -> None: + _write_pyproject( + tmp_path, + """ + [tool.attune-author.fact-check.faithfulness] + block_polish_on_unavailable = true + """, + ) + cfg = load_config(tmp_path) + assert cfg.block_polish_on_unavailable is True diff --git a/tests/unit/faithfulness/test_judge.py b/tests/unit/faithfulness/test_judge.py new file mode 100644 index 0000000..77b26e2 --- /dev/null +++ b/tests/unit/faithfulness/test_judge.py @@ -0,0 +1,332 @@ +"""Tests for the faithfulness judge wrapper.""" + +from __future__ import annotations + +import sys +import types +from pathlib import Path + +import pytest + +from attune_author.faithfulness import ( + FaithfulnessConfig, + JudgeOutcome, + apply_review_block, + estimate_cost_usd, + format_review_block, + judge_polished_file, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +class _FakeJudgeResult: + """Stand-in for ``attune_rag.eval.faithfulness.FaithfulnessResult``.""" + + def __init__( + self, + score: float, + supported: list[str] | None = None, + unsupported: list[str] | None = None, + reasoning: str = "", + ) -> None: + self.score = score + self.supported_claims = supported or [] + self.unsupported_claims = unsupported or [] + self.reasoning = reasoning + + +def _install_fake_attune_rag(monkeypatch, judge_score: float, **kwargs): + """Install a fake 
``attune_rag.eval.faithfulness`` module into sys.modules. + + The fake exposes a ``FaithfulnessJudge`` whose ``score`` coroutine + returns a ``_FakeJudgeResult`` with the given score. + """ + + class _FakeJudge: + def __init__(self, model: str = "x", **_): + self.model = model + + async def score(self, query, answer, passages): # noqa: D401 + return _FakeJudgeResult(score=judge_score, **kwargs) + + fake_module = types.ModuleType("attune_rag.eval.faithfulness") + fake_module.FaithfulnessJudge = _FakeJudge + + parent = types.ModuleType("attune_rag") + eval_pkg = types.ModuleType("attune_rag.eval") + eval_pkg.faithfulness = fake_module + parent.eval = eval_pkg + + monkeypatch.setitem(sys.modules, "attune_rag", parent) + monkeypatch.setitem(sys.modules, "attune_rag.eval", eval_pkg) + monkeypatch.setitem(sys.modules, "attune_rag.eval.faithfulness", fake_module) + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") # pragma: allowlist secret + + +def _block_attune_rag(monkeypatch) -> None: + """Force ``import attune_rag.eval.faithfulness`` to fail with ImportError.""" + monkeypatch.setitem(sys.modules, "attune_rag.eval.faithfulness", None) + + +# --------------------------------------------------------------------------- +# estimate_cost_usd +# --------------------------------------------------------------------------- + + +def test_estimate_cost_haiku_cheaper_than_sonnet() -> None: + haiku = estimate_cost_usd(10_000, model="claude-haiku-4-5") + sonnet = estimate_cost_usd(10_000, model="claude-sonnet-4-6") + assert haiku < sonnet + + +def test_estimate_cost_scales_with_input_size() -> None: + small = estimate_cost_usd(1_000, model="claude-sonnet-4-6") + big = estimate_cost_usd(10_000, model="claude-sonnet-4-6") + assert big > small + + +def test_estimate_cost_unknown_model_falls_back_to_sonnet() -> None: + unknown = estimate_cost_usd(10_000, model="unknown-future-model") + sonnet = estimate_cost_usd(10_000, model="claude-sonnet-4-6") + assert unknown == 
pytest.approx(sonnet) + + +# --------------------------------------------------------------------------- +# judge_polished_file — skip paths +# --------------------------------------------------------------------------- + + +def test_judge_skipped_when_disabled(tmp_path: Path) -> None: + polished = tmp_path / "doc.md" + polished.write_text("# Hi", encoding="utf-8") + outcome = judge_polished_file(polished, [], config=FaithfulnessConfig(enabled=False)) + assert outcome.score is None + assert outcome.skipped_reason == "disabled" + assert outcome.threshold_met is True + + +def test_judge_skipped_when_polished_file_unreadable(tmp_path: Path) -> None: + missing = tmp_path / "does-not-exist.md" + outcome = judge_polished_file(missing, [], config=FaithfulnessConfig(enabled=True)) + assert outcome.score is None + assert outcome.skipped_reason == "polished file unreadable" + + +def test_judge_skipped_when_no_readable_sources(tmp_path: Path) -> None: + polished = tmp_path / "doc.md" + polished.write_text("# Hi", encoding="utf-8") + missing_src = tmp_path / "ghost.py" + outcome = judge_polished_file(polished, [missing_src], config=FaithfulnessConfig(enabled=True)) + assert outcome.score is None + assert outcome.skipped_reason == "no readable source passages" + + +def test_judge_skipped_when_over_budget(tmp_path: Path) -> None: + polished = tmp_path / "doc.md" + polished.write_text("x" * 200_000, encoding="utf-8") + src = tmp_path / "src.py" + src.write_text("y" * 200_000, encoding="utf-8") + config = FaithfulnessConfig(enabled=True, budget_per_file_usd=0.001) + outcome = judge_polished_file(polished, [src], config=config) + assert outcome.score is None + assert outcome.skipped_reason == "over-budget" + + +def test_judge_skipped_when_attune_rag_missing(tmp_path: Path, monkeypatch) -> None: + polished = tmp_path / "doc.md" + polished.write_text("# Hi", encoding="utf-8") + src = tmp_path / "src.py" + src.write_text("def x(): pass\n", encoding="utf-8") + 
_block_attune_rag(monkeypatch) + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") # pragma: allowlist secret + + outcome = judge_polished_file(polished, [src], config=FaithfulnessConfig(enabled=True)) + + assert outcome.score is None + assert outcome.skipped_reason is not None + assert "attune-rag" in outcome.skipped_reason + + +def test_judge_raises_when_block_polish_on_unavailable_and_extra_missing( + tmp_path: Path, + monkeypatch, +) -> None: + polished = tmp_path / "doc.md" + polished.write_text("# Hi", encoding="utf-8") + src = tmp_path / "src.py" + src.write_text("def x(): pass\n", encoding="utf-8") + _block_attune_rag(monkeypatch) + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") # pragma: allowlist secret + + with pytest.raises(RuntimeError, match="attune-rag"): + judge_polished_file( + polished, + [src], + config=FaithfulnessConfig(enabled=True, block_polish_on_unavailable=True), + ) + + +def test_judge_skipped_when_api_key_missing(tmp_path: Path, monkeypatch) -> None: + polished = tmp_path / "doc.md" + polished.write_text("# Hi", encoding="utf-8") + src = tmp_path / "src.py" + src.write_text("def x(): pass\n", encoding="utf-8") + _install_fake_attune_rag(monkeypatch, judge_score=1.0) + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + + outcome = judge_polished_file(polished, [src], config=FaithfulnessConfig(enabled=True)) + + assert outcome.score is None + assert outcome.skipped_reason == "ANTHROPIC_API_KEY missing" + + +# --------------------------------------------------------------------------- +# judge_polished_file — happy path +# --------------------------------------------------------------------------- + + +def test_judge_runs_and_records_supported_claims(tmp_path: Path, monkeypatch) -> None: + polished = tmp_path / "doc.md" + polished.write_text("Polished content.", encoding="utf-8") + src = tmp_path / "src.py" + src.write_text("def real(): pass\n", encoding="utf-8") + + _install_fake_attune_rag( + monkeypatch, + judge_score=0.97, 
+ supported=["claim a"], + unsupported=[], + reasoning="all good", + ) + + outcome = judge_polished_file( + polished, + [src], + config=FaithfulnessConfig(enabled=True, threshold=0.95), + ) + + assert outcome.score == pytest.approx(0.97) + assert outcome.threshold_met is True + assert outcome.supported_claims == ["claim a"] + + +def test_judge_below_threshold_flags_threshold_not_met( + tmp_path: Path, + monkeypatch, +) -> None: + polished = tmp_path / "doc.md" + polished.write_text("Polished content.", encoding="utf-8") + src = tmp_path / "src.py" + src.write_text("def real(): pass\n", encoding="utf-8") + + _install_fake_attune_rag( + monkeypatch, + judge_score=0.6, + supported=["a"], + unsupported=["b", "c"], + reasoning="some drift", + ) + + outcome = judge_polished_file( + polished, + [src], + config=FaithfulnessConfig(enabled=True, threshold=0.95), + ) + + assert outcome.score == pytest.approx(0.6) + assert outcome.threshold_met is False + assert outcome.unsupported_claims == ["b", "c"] + + +def test_judge_handles_transient_call_failure(tmp_path: Path, monkeypatch) -> None: + polished = tmp_path / "doc.md" + polished.write_text("Polished content.", encoding="utf-8") + src = tmp_path / "src.py" + src.write_text("def real(): pass\n", encoding="utf-8") + + fake_module = types.ModuleType("attune_rag.eval.faithfulness") + + class _BadJudge: + def __init__(self, **_): ... 
+ + async def score(self, query, answer, passages): + raise RuntimeError("transient API failure") + + fake_module.FaithfulnessJudge = _BadJudge + parent = types.ModuleType("attune_rag") + eval_pkg = types.ModuleType("attune_rag.eval") + eval_pkg.faithfulness = fake_module + parent.eval = eval_pkg + + monkeypatch.setitem(sys.modules, "attune_rag", parent) + monkeypatch.setitem(sys.modules, "attune_rag.eval", eval_pkg) + monkeypatch.setitem(sys.modules, "attune_rag.eval.faithfulness", fake_module) + monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") # pragma: allowlist secret + + outcome = judge_polished_file(polished, [src], config=FaithfulnessConfig(enabled=True)) + + assert outcome.score is None + assert outcome.skipped_reason is not None + assert "judge call failed" in outcome.skipped_reason + + +# --------------------------------------------------------------------------- +# format_review_block + apply_review_block +# --------------------------------------------------------------------------- + + +def test_format_review_block_returns_empty_when_score_is_none() -> None: + outcome = JudgeOutcome( + score=None, + threshold_met=True, + cost_estimate_usd=0.0, + supported_claims=[], + unsupported_claims=[], + reasoning="", + ) + assert format_review_block(outcome, threshold=0.95) == "" + + +def test_format_review_block_lists_unsupported_claims() -> None: + outcome = JudgeOutcome( + score=0.5, + threshold_met=False, + cost_estimate_usd=0.01, + supported_claims=["good"], + unsupported_claims=["bad-1", "bad-2"], + reasoning="explanation", + ) + block = format_review_block(outcome, threshold=0.95) + assert "## Faithfulness review" in block + assert "Score 0.50" in block or "Score** 0.50" in block + assert "bad-1" in block + assert "bad-2" in block + assert "explanation" in block + + +def test_apply_review_block_appends_block(tmp_path: Path) -> None: + polished = tmp_path / "doc.md" + polished.write_text("# Original\n\nBody.", encoding="utf-8") + outcome = JudgeOutcome( + 
score=0.5, + threshold_met=False, + cost_estimate_usd=0.01, + supported_claims=[], + unsupported_claims=["x"], + reasoning="r", + ) + block = format_review_block(outcome, threshold=0.95) + appended = apply_review_block(polished, block) + assert appended is True + final = polished.read_text(encoding="utf-8") + assert "# Original" in final + assert "## Faithfulness review" in final + + +def test_apply_review_block_returns_false_on_empty(tmp_path: Path) -> None: + polished = tmp_path / "doc.md" + polished.write_text("# Original", encoding="utf-8") + assert apply_review_block(polished, "") is False diff --git a/tests/unit/faithfulness/test_pipeline_wiring.py b/tests/unit/faithfulness/test_pipeline_wiring.py new file mode 100644 index 0000000..3821c8f --- /dev/null +++ b/tests/unit/faithfulness/test_pipeline_wiring.py @@ -0,0 +1,134 @@ +"""Tests for the generator <-> faithfulness wiring + telemetry.""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +from attune_author.faithfulness import JudgeOutcome +from attune_author.generator import ( + _faithfulness_telemetry, + _run_faithfulness_judge, + reset_faithfulness_telemetry, +) + + +def _outcome( + score: float | None, *, threshold_met: bool, skipped: str | None = None +) -> JudgeOutcome: + return JudgeOutcome( + score=score, + threshold_met=threshold_met, + cost_estimate_usd=0.0123, + supported_claims=[], + unsupported_claims=["bad"] if score is not None and not threshold_met else [], + reasoning="r", + skipped_reason=skipped, + ) + + +def setup_function() -> None: + reset_faithfulness_telemetry() + + +def test_run_faithfulness_judge_disabled_via_env(tmp_path: Path, monkeypatch) -> None: + """ATTUNE_AUTHOR_FAITHFULNESS=off short-circuits before doing any work.""" + monkeypatch.setenv("ATTUNE_AUTHOR_FAITHFULNESS", "off") + polished = tmp_path / "x.md" + polished.write_text("# x\n", encoding="utf-8") + + with patch("attune_author.faithfulness.judge_polished_file") as 
mock_judge: + _run_faithfulness_judge(polished, [], tmp_path) + + mock_judge.assert_not_called() + + +def test_run_faithfulness_judge_disabled_via_config(tmp_path: Path) -> None: + """No pyproject.toml -> config.enabled defaults to False -> no judge call.""" + polished = tmp_path / "x.md" + polished.write_text("# x\n", encoding="utf-8") + + with patch("attune_author.faithfulness.judge_polished_file") as mock_judge: + _run_faithfulness_judge(polished, [], tmp_path) + + mock_judge.assert_not_called() + + +def _enable_via_pyproject(tmp_path: Path) -> None: + (tmp_path / "pyproject.toml").write_text( + "[tool.attune-author.fact-check.faithfulness]\nenabled = true\n", + encoding="utf-8", + ) + + +def test_run_faithfulness_judge_telemetry_on_success(tmp_path: Path) -> None: + _enable_via_pyproject(tmp_path) + polished = tmp_path / "x.md" + polished.write_text("# x\n", encoding="utf-8") + + outcome = _outcome(0.97, threshold_met=True) + with patch("attune_author.faithfulness.judge_polished_file", return_value=outcome): + _run_faithfulness_judge(polished, [], tmp_path) + + telemetry = _faithfulness_telemetry() + assert telemetry["calls"] == 1 + assert telemetry["skipped"] == 0 + assert telemetry["cost_usd"] == 0.0123 + + +def test_run_faithfulness_judge_telemetry_on_skip(tmp_path: Path) -> None: + _enable_via_pyproject(tmp_path) + polished = tmp_path / "x.md" + polished.write_text("# x\n", encoding="utf-8") + + outcome = _outcome(None, threshold_met=True, skipped="over-budget") + with patch("attune_author.faithfulness.judge_polished_file", return_value=outcome): + _run_faithfulness_judge(polished, [], tmp_path) + + telemetry = _faithfulness_telemetry() + assert telemetry["calls"] == 0 + assert telemetry["skipped"] == 1 + assert telemetry["cost_usd"] == 0 + + +def test_run_faithfulness_judge_appends_review_block_when_below_threshold( + tmp_path: Path, +) -> None: + _enable_via_pyproject(tmp_path) + polished = tmp_path / "x.md" + polished.write_text("# Original\n", 
encoding="utf-8") + + outcome = _outcome(0.5, threshold_met=False) + with patch("attune_author.faithfulness.judge_polished_file", return_value=outcome): + _run_faithfulness_judge(polished, [], tmp_path) + + final = polished.read_text(encoding="utf-8") + assert "## Faithfulness review" in final + assert "bad" in final + + +def test_run_faithfulness_judge_swallows_unexpected_exceptions(tmp_path: Path) -> None: + """A buggy judge layer must never break the polish pipeline.""" + _enable_via_pyproject(tmp_path) + polished = tmp_path / "x.md" + polished.write_text("# Original\n", encoding="utf-8") + + with patch( + "attune_author.faithfulness.judge_polished_file", + side_effect=RuntimeError("kaboom"), + ): + # Should not raise. + _run_faithfulness_judge(polished, [], tmp_path) + + +def test_reset_faithfulness_telemetry_zeros_counters(tmp_path: Path) -> None: + _enable_via_pyproject(tmp_path) + polished = tmp_path / "x.md" + polished.write_text("# x\n", encoding="utf-8") + outcome = _outcome(0.97, threshold_met=True) + with patch("attune_author.faithfulness.judge_polished_file", return_value=outcome): + _run_faithfulness_judge(polished, [], tmp_path) + + reset_faithfulness_telemetry() + telemetry = _faithfulness_telemetry() + assert telemetry == {"calls": 0.0, "skipped": 0.0, "cost_usd": 0.0}