Skip to content

Commit af17782

Browse files
refactor: centralize flip detection and add comprehensive tests for diff command
- Extract answer extraction and flip detection logic into a new `_annotate_flip_metadata` helper method in `TestRunner` to eliminate code duplication between `compare` and `diff`. - Add unit tests for the `diff` CLI command (`test_cli_diff.py`) to verify summary table rendering, backend metrics, and flip rate formatting. - Introduce `StubBackend` in `test_runner.py` and add async tests to ensure the `diff` runner method accurately detects answer flips across single and multiple test backends.
1 parent 6cad8fa commit af17782

3 files changed

Lines changed: 277 additions & 40 deletions

File tree

src/infer_check/runner.py

Lines changed: 28 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,32 @@ def __init__(self, cache_dir: str | Path = ".infer_check_cache"):
2727
self.cache_dir = Path(cache_dir)
2828
self.cache_dir.mkdir(parents=True, exist_ok=True)
2929

30+
def _annotate_flip_metadata(
    self,
    comp: ComparisonResult,
    text_a: str,
    text_b: str,
    category: str,
) -> None:
    """Extract functional answers from both outputs and record flip metadata.

    Mutates ``comp.metadata`` in place with the extracted answers, whether
    they disagree (``flipped``), the extraction strategy used, and the lower
    of the two extraction confidences.
    """
    # Imported lazily so the analysis package is only loaded when needed.
    from infer_check.analysis.answer_extract import (
        answers_match,
        extract_answer,
    )

    extracted_a = extract_answer(text_a, category)
    extracted_b = extract_answer(text_b, category)

    comp.metadata.update(
        {
            "flipped": not answers_match(extracted_a, extracted_b),
            "answer_a": extracted_a.value,
            "answer_b": extracted_b.value,
            "extraction_strategy": extracted_a.strategy,
            "extraction_confidence": min(
                extracted_a.confidence,
                extracted_b.confidence,
            ),
        }
    )
3056
def _save_checkpoint(self, results: Any, path: Path) -> None:
3157
"""Write intermediate results as JSON for resumability."""
3258
path.parent.mkdir(parents=True, exist_ok=True)
@@ -401,30 +427,12 @@ async def compare(
401427
progress.advance(task)
402428

403429
# ── Build comparisons with answer extraction ────────────────
404-
from infer_check.analysis.answer_extract import (
405-
answers_match,
406-
extract_answer,
407-
)
408-
409430
for prompt in prompts:
410431
a = results_a.get(prompt.id)
411432
b = results_b.get(prompt.id)
412433
if a and b:
413434
comp = self._compare(a, b)
414-
415-
# Extract functional answers and check for flips.
416-
ans_a = extract_answer(a.text, prompt.category)
417-
ans_b = extract_answer(b.text, prompt.category)
418-
flipped = not answers_match(ans_a, ans_b)
419-
420-
comp.metadata["flipped"] = flipped
421-
comp.metadata["answer_a"] = ans_a.value
422-
comp.metadata["answer_b"] = ans_b.value
423-
comp.metadata["extraction_strategy"] = ans_a.strategy
424-
comp.metadata["extraction_confidence"] = min(
425-
ans_a.confidence,
426-
ans_b.confidence,
427-
)
435+
self._annotate_flip_metadata(comp, a.text, b.text, prompt.category)
428436
comparisons.append(comp)
429437

430438
# ── Aggregate metrics ────────────────────────────────────────
@@ -491,11 +499,6 @@ async def diff(
491499
prompts: list[Prompt],
492500
) -> list[ComparisonResult]:
493501
"""Compare outputs across different backends against a baseline."""
494-
from infer_check.analysis.answer_extract import (
495-
answers_match,
496-
extract_answer,
497-
)
498-
499502
baseline_results: dict[str, InferenceResult] = {}
500503
comparisons: list[ComparisonResult] = []
501504

@@ -538,21 +541,7 @@ async def diff(
538541
baseline = baseline_results.get(test_res.prompt_id)
539542
if baseline:
540543
comp = self._compare(baseline, test_res)
541-
542-
# Answer extraction and flip detection
543-
ans_a = extract_answer(baseline.text, prompt.category)
544-
ans_b = extract_answer(test_res.text, prompt.category)
545-
flipped = not answers_match(ans_a, ans_b)
546-
547-
comp.metadata["flipped"] = flipped
548-
comp.metadata["answer_a"] = ans_a.value
549-
comp.metadata["answer_b"] = ans_b.value
550-
comp.metadata["extraction_strategy"] = ans_a.strategy
551-
comp.metadata["extraction_confidence"] = min(
552-
ans_a.confidence,
553-
ans_b.confidence,
554-
)
555-
544+
self._annotate_flip_metadata(comp, baseline.text, test_res.text, prompt.category)
556545
comparisons.append(comp)
557546
progress.advance(task)
558547

tests/unit/test_cli_diff.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
from pathlib import Path
2+
from typing import Any
3+
from unittest.mock import patch
4+
5+
import pytest
6+
from click.testing import CliRunner
7+
8+
from infer_check.cli import main
9+
from infer_check.types import ComparisonResult, InferenceResult
10+
11+
12+
@pytest.fixture
def runner() -> CliRunner:
    """Provide a fresh Click test runner for each test case."""
    return CliRunner()
15+
16+
17+
def test_cli_diff_summary_table(runner: CliRunner, tmp_path: Path) -> None:
    """The diff summary table renders headers and per-backend metrics.

    Mocks ``TestRunner.diff`` to return one passing and one flipped
    comparison, then checks the rendered failure rate, flip rate, and mean
    similarity.
    """
    # Dummy prompt suite on disk; the runner itself is mocked below, so the
    # contents only need to parse.
    dummy_suite = tmp_path / "dummy.jsonl"
    dummy_suite.write_text(
        '{"id":"p1", "text":"hi", "category":"general"}\n{"id":"p2", "text":"bye", "category":"general"}'
    )

    # Inference results: p1 agrees across backends, p2 differs.
    inf_res_baseline_1 = InferenceResult(
        prompt_id="p1", backend_name="baseline", model_id="m", tokens=["hi"], text="hi", latency_ms=1.0
    )
    inf_res_test_1 = InferenceResult(
        prompt_id="p1", backend_name="test", model_id="m", tokens=["hi"], text="hi", latency_ms=1.0
    )

    inf_res_baseline_2 = InferenceResult(
        prompt_id="p2", backend_name="baseline", model_id="m", tokens=["bye"], text="bye", latency_ms=1.0
    )
    inf_res_test_2 = InferenceResult(
        prompt_id="p2", backend_name="test", model_id="m", tokens=["hello"], text="hello", latency_ms=1.0
    )

    # comp1: identical outputs, not flipped.
    comp1 = ComparisonResult(
        baseline=inf_res_baseline_1,
        test=inf_res_test_1,
        text_similarity=1.0,
        is_failure=False,
        metadata={"flipped": False},
    )
    # comp2: divergent outputs, flipped and a failure.
    comp2 = ComparisonResult(
        baseline=inf_res_baseline_2,
        test=inf_res_test_2,
        text_similarity=0.4,
        is_failure=True,
        metadata={"flipped": True},
    )

    with (
        patch("infer_check.backends.base.get_backend"),
        patch("infer_check.runner.TestRunner.diff") as mock_diff,
    ):
        # The CLI awaits diff(), so the side effect must be a coroutine
        # function returning the canned comparisons.
        async def mock_diff_async(*args: Any, **kwargs: Any) -> list[ComparisonResult]:
            return [comp1, comp2]

        mock_diff.side_effect = mock_diff_async

        result = runner.invoke(
            main,
            [
                "diff",
                "--model",
                "m1",
                "--backends",
                "mlx-lm,llama-cpp",
                "--prompts",
                str(dummy_suite),
                "--output",
                str(tmp_path),
            ],
        )

    assert result.exit_code == 0

    # Table headers are present.
    assert "test_backend" in result.output
    assert "failures" in result.output
    assert "failure_rate" in result.output
    assert "flip_rate" in result.output
    assert "mean_similarity" in result.output

    # The CLI groups comparisons by ``comp.test.backend_name``, so the mocked
    # results appear under "test"; the requested backend names are still
    # echoed in the output.
    assert "llama-cpp" in result.output
    assert "test" in result.output

    # 2 prompts, 1 failure -> 50.00%.
    assert "50.00%" in result.output

    # 1 flip out of 2 -> rendered as "50.0%" (the Rich colour markup is not
    # part of CliRunner's captured output).
    assert "50.0%" in result.output

    # Mean similarity: (1.0 + 0.4) / 2 = 0.7, shown to four decimals.
    assert "0.7000" in result.output
113+
114+
def test_cli_diff_summary_no_flips(runner: CliRunner, tmp_path: Path) -> None:
    """A diff run with no flipped answers reports a 0.0% flip rate."""
    dummy_suite = tmp_path / "dummy_no_flips.jsonl"
    dummy_suite.write_text('{"id":"p1", "text":"hi", "category":"general"}')

    inf_res_baseline = InferenceResult(
        prompt_id="p1", backend_name="baseline", model_id="m", tokens=["hi"], text="hi", latency_ms=1.0
    )
    inf_res_test = InferenceResult(
        prompt_id="p1", backend_name="test", model_id="m", tokens=["hi"], text="hi", latency_ms=1.0
    )

    # Single comparison: identical outputs, nothing flipped.
    comp = ComparisonResult(
        baseline=inf_res_baseline, test=inf_res_test, text_similarity=1.0, is_failure=False, metadata={"flipped": False}
    )

    with (
        patch("infer_check.backends.base.get_backend"),
        patch("infer_check.runner.TestRunner.diff") as mock_diff,
    ):

        async def mock_diff_async(*args: Any, **kwargs: Any) -> list[ComparisonResult]:
            return [comp]

        mock_diff.side_effect = mock_diff_async

        cli_args = [
            "diff",
            "--model",
            "m1",
            "--backends",
            "mlx-lm,llama-cpp",
            "--prompts",
            str(dummy_suite),
            "--output",
            str(tmp_path),
        ]
        result = runner.invoke(main, cli_args)

    assert result.exit_code == 0
    assert "0.0%" in result.output

tests/unit/test_runner.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import asyncio
2+
13
import pytest
24

35
from infer_check.runner import TestRunner
4-
from infer_check.types import InferenceResult
6+
from infer_check.types import InferenceResult, Prompt
57

68

79
@pytest.fixture
@@ -115,3 +117,94 @@ def test_compare_threshold_edge_cases(runner: TestRunner) -> None:
115117
# Since similarity is ~0.8, it should fail.
116118
comp_strict = runner._compare(baseline, test_res, threshold=0.9)
117119
assert comp_strict.is_failure is True
120+
121+
122+
class StubBackend:
    """In-memory backend that serves canned responses keyed by prompt id."""

    def __init__(self, name: str, responses: dict[str, str]):
        self._name = name
        self._responses = responses
        # Set by cleanup() so tests can verify teardown happened.
        self.cleanup_called = False

    @property
    def name(self) -> str:
        return self._name

    async def generate(self, prompt: Prompt) -> InferenceResult:
        """Return a deterministic result built from the canned response."""
        reply = self._responses.get(prompt.id, "Default response")
        return InferenceResult(
            prompt_id=prompt.id,
            backend_name=self._name,
            model_id="stub-model",
            tokens=reply.split(),
            text=reply,
            latency_ms=1.0,
            metadata={},
        )

    async def generate_batch(self, prompts: list[Prompt]) -> list[InferenceResult]:
        """Generate results for all prompts concurrently."""
        pending = [self.generate(p) for p in prompts]
        return await asyncio.gather(*pending)

    async def health_check(self) -> bool:
        # The stub is always healthy.
        return True

    async def cleanup(self) -> None:
        self.cleanup_called = True
152+
153+
154+
@pytest.mark.asyncio
async def test_diff_flip_detection(runner: TestRunner) -> None:
    """diff() flags a flipped answer on p1 and agreement on p2."""
    arithmetic_prompt = Prompt(id="p1", text="What is 2+2?", category="arithmetic")
    capital_prompt = Prompt(id="p2", text="What is the capital of France?", category="general")

    # p1: baseline answers 4, test answers 5 -> flip.
    # p2: both answer Paris -> no flip.
    baseline_backend = StubBackend("baseline", {"p1": "The answer is 4", "p2": "Paris"})
    test_backend = StubBackend("test", {"p1": "The answer is 5", "p2": "Paris"})

    comparisons = await runner.diff(
        baseline_backend=baseline_backend,
        test_backends=[test_backend],
        prompts=[arithmetic_prompt, capital_prompt],
    )

    assert len(comparisons) == 2

    by_prompt = {c.baseline.prompt_id: c for c in comparisons}

    # p1 flipped: the extracted numeric answers disagree.
    flipped_comp = by_prompt["p1"]
    assert flipped_comp.metadata["flipped"] is True
    assert flipped_comp.metadata["answer_a"] == "4"
    assert flipped_comp.metadata["answer_b"] == "5"
    assert "extraction_confidence" in flipped_comp.metadata

    # p2 stable: both sides extracted the same answer.
    stable_comp = by_prompt["p2"]
    assert stable_comp.metadata["flipped"] is False
    assert stable_comp.metadata["answer_a"].lower() == "paris"
    assert stable_comp.metadata["answer_b"].lower() == "paris"
    assert "extraction_confidence" in stable_comp.metadata
189+
190+
191+
@pytest.mark.asyncio
async def test_diff_multiple_test_backends(runner: TestRunner) -> None:
    """Each test backend is diffed against the baseline independently."""
    prompt = Prompt(id="p1", text="Test", category="general")

    baseline = StubBackend("baseline", {"p1": "A"})
    # test1 disagrees with the baseline; test2 agrees.
    test1 = StubBackend("test1", {"p1": "B"})
    test2 = StubBackend("test2", {"p1": "A"})

    comparisons = await runner.diff(
        baseline_backend=baseline,
        test_backends=[test1, test2],
        prompts=[prompt],
    )

    # One comparison per test backend, in backend order.
    assert len(comparisons) == 2
    first, second = comparisons
    assert first.test.backend_name == "test1"
    assert first.metadata["flipped"] is True
    assert second.test.backend_name == "test2"
    assert second.metadata["flipped"] is False

0 commit comments

Comments
 (0)