bug/kl-divergence (#9)

NullPointerDepressiveDisorder · web-flow · commit 4d0ea3eb4803 · 2026-04-01T21:12:51.000-07:00
* fix: improve logprob extraction and error handling in mlx backend, correct token ID parsing in runner

* test: add KL divergence test for rank index alignment in llama.cpp distributions

* test: add tests for MLXBackend generate fallback and double failure handling
diff --git a/src/infer_check/backends/mlx_lm.py b/src/infer_check/backends/mlx_lm.py
@@ -47,9 +47,9 @@ async def generate(self, prompt: Prompt) -> InferenceResult:
         try:
             return self._generate_with_logprobs(prompt)
         except Exception as exc:
-            import logging
+            from rich.console import Console
 
-            logging.debug("generate_step failed (%s), falling back to simple generate", exc)
+            Console().print(f"[yellow]⚠ generate_step failed, falling back to simple generate: {exc}[/yellow]")
             try:
                 return self._generate_simple(prompt)
             except Exception as inner:
@@ -184,7 +184,7 @@ def _generate_with_logprobs(self, prompt: Prompt) -> InferenceResult:
         temp = prompt.metadata.get("temperature", 0.0) if prompt.metadata else 0.0
         sampler = make_sampler(temp=temp)
         formatted = self._format_prompt(prompt.text)
-        input_ids = self._tokenizer.encode(formatted, return_tensors="mlx")
+        input_ids = mx.array(self._tokenizer.encode(formatted))
 
         # Configurable top-K to avoid memory explosion. Default to 10.
         top_k = prompt.metadata.get("top_k_logprobs", 10) if prompt.metadata else 10
@@ -225,13 +225,7 @@ def _generate_with_logprobs(self, prompt: Prompt) -> InferenceResult:
                 if effective_top_k > vocab_size:
                     effective_top_k = vocab_size
 
-                # Get top-K indices and values
-                if hasattr(mx, "topk"):
-                    # mx.topk is the most efficient way to get top-K if available.
-                    top_k_values, top_k_indices = mx.topk(logprob_dist, effective_top_k)
-                    dist_list = cast(list[float], top_k_values.tolist())
-                    dist_indices = cast(list[int], top_k_indices.tolist())
-                elif hasattr(mx, "argpartition"):
+                if hasattr(mx, "argpartition"):
                     # Fallback to argpartition which is often available in newer MLX.
                     top_k_indices = mx.argpartition(-logprob_dist, effective_top_k - 1)[:effective_top_k]
                     top_k_values = logprob_dist[top_k_indices]
@@ -264,7 +258,7 @@ def _generate_with_logprobs(self, prompt: Prompt) -> InferenceResult:
                     meta[f"id_{i}"] = int(idx)
                 distribution_metadata.append(meta)
 
-            token_id = int(token.item())
+            token_id = int(token)
             token_str = self._tokenizer.decode([token_id])
             tokens.append(token_str)
 
diff --git a/src/infer_check/runner.py b/src/infer_check/runner.py
@@ -124,8 +124,8 @@ def to_probs(dist: Any) -> Any:
                 # llama-server: id_0, id_1, ... (top-K)
                 elif b_meta and t_meta and "id_0" in b_meta and "id_0" in t_meta:
                     # Align on union of token IDs (can be int IDs or token strings)
-                    b_ids = {v: i for k, v in b_meta.items() if k.startswith("id_")}
-                    t_ids = {v: i for k, v in t_meta.items() if k.startswith("id_")}
+                    b_ids = {v: int(k.split("_")[1]) for k, v in b_meta.items() if k.startswith("id_")}
+                    t_ids = {v: int(k.split("_")[1]) for k, v in t_meta.items() if k.startswith("id_")}
 
                     # Skip if ID types are different (e.g. int vs str)
                     b_id_types = {type(v) for v in b_ids}
diff --git a/tests/unit/test_kl_alignment.py b/tests/unit/test_kl_alignment.py
@@ -72,6 +72,55 @@ def test_kl_alignment_llama_cpp() -> None:
     assert result.kl_divergence > 0
 
 
+def test_kl_alignment_llama_cpp_rank_index() -> None:
+    """Test KL computation for top-K aligned distributions using rank index from key name (id_N).
+
+    This test asserts that the rank index 'N' in the key 'id_N' is what matters,
+    not the iteration order or the key's position in the metadata dictionary.
+    """
+    import numpy as np
+
+    runner = TestRunner()
+
+    # Case: the baseline metadata lists the id_1 key before id_0,
+    # but its distribution is still ordered by rank: [prob_of_id_0, prob_of_id_1].
+    # If the logic incorrectly uses the order in which keys are iterated,
+    # it would pair each token ID with the wrong probability.
+
+    baseline = InferenceResult(
+        prompt_id="p1",
+        backend_name="b1",
+        model_id="m1",
+        text="hi",
+        tokens=["h"],
+        distributions=[[0.8, 0.2]],
+        # Swapped order in dict
+        distribution_metadata=[{"id_1": 11, "id_0": 10}],
+        latency_ms=10.0,
+    )
+
+    test = InferenceResult(
+        prompt_id="p1",
+        backend_name="b2",
+        model_id="m1",
+        text="hi",
+        tokens=["h"],
+        distributions=[[0.7, 0.3]],
+        # Normal order
+        distribution_metadata=[{"id_0": 10, "id_1": 11}],
+        latency_ms=10.0,
+    )
+
+    result = runner._compare(baseline, test)
+    assert result.kl_divergence is not None
+
+    expected_p = np.array([0.8, 0.2])
+    expected_q = np.array([0.7, 0.3])
+    expected_kl = np.sum(expected_p * np.log(expected_p / expected_q))
+
+    assert np.isclose(result.kl_divergence, expected_kl, atol=1e-5)
+
+
 def test_kl_skips_unaligned() -> None:
     """Ensure KL is None if distributions cannot be aligned."""
     runner = TestRunner()
diff --git a/tests/unit/test_mlx_backend.py b/tests/unit/test_mlx_backend.py
@@ -67,3 +67,69 @@ def test_mlx_cleanup(mock_mlx: tuple[MagicMock, MagicMock, MagicMock]) -> None:
     asyncio.run(backend.cleanup())
     assert backend._model is None
     assert backend._tokenizer is None
+
+
+@pytest.mark.asyncio
+async def test_mlx_generate_fallback(mock_mlx: tuple[MagicMock, MagicMock, MagicMock]) -> None:
+    from unittest.mock import patch
+
+    from infer_check.types import InferenceResult, Prompt
+
+    backend = MLXBackend(model_id="dummy-model")
+    backend._model = mock_mlx[1]
+    backend._tokenizer = mock_mlx[2]
+
+    prompt = Prompt(text="test prompt")
+    simple_result = InferenceResult(
+        prompt_id=prompt.id,
+        backend_name="mlx-lm",
+        model_id="dummy-model",
+        tokens=["hello"],
+        text="hello",
+        latency_ms=10.0,
+    )
+
+    with (
+        patch.object(MLXBackend, "_generate_with_logprobs") as mock_logprobs,
+        patch.object(MLXBackend, "_generate_simple") as mock_simple,
+        patch("rich.console.Console.print") as mock_print,
+    ):
+        mock_logprobs.side_effect = Exception("Logprobs failed")
+        mock_simple.return_value = simple_result
+
+        result = await backend.generate(prompt)
+
+        assert result == simple_result
+        mock_logprobs.assert_called_once_with(prompt)
+        mock_simple.assert_called_once_with(prompt)
+        mock_print.assert_called_once()
+        args, _ = mock_print.call_args
+        assert "generate_step failed, falling back to simple generate" in args[0]
+        assert "Logprobs failed" in args[0]
+
+
+@pytest.mark.asyncio
+async def test_mlx_generate_double_failure(mock_mlx: tuple[MagicMock, MagicMock, MagicMock]) -> None:
+    from unittest.mock import patch
+
+    from infer_check.types import Prompt
+
+    backend = MLXBackend(model_id="dummy-model")
+    backend._model = mock_mlx[1]
+    backend._tokenizer = mock_mlx[2]
+
+    prompt = Prompt(text="test prompt")
+
+    with (
+        patch.object(MLXBackend, "_generate_with_logprobs") as mock_logprobs,
+        patch.object(MLXBackend, "_generate_simple") as mock_simple,
+        patch("rich.console.Console.print"),
+    ):
+        mock_logprobs.side_effect = Exception("Logprobs failed")
+        mock_simple.side_effect = Exception("Simple failed")
+
+        with pytest.raises(RuntimeError) as exc_info:
+            await backend.generate(prompt)
+
+        assert "MLX generation failed" in str(exc_info.value)
+        assert "Simple failed" in str(exc_info.value)