test: add coverage for load_suite category distribution logic

NullPointerDepressiveDisorder · NullPointerDepressiveDisorder · commit 982942667492 · 2026-04-12T19:31:41.000-07:00
- Add unit tests to validate equal and uneven category prompt distribution
- Extend `load_suite` with an optional `num_prompts` limit that picks prompts round-robin across categories
- Update OpenAI backend to parse logprobs and token distributions
diff --git a/src/infer_check/backends/openai_compat.py b/src/infer_check/backends/openai_compat.py
@@ -65,11 +65,13 @@ async def generate(self, prompt: Prompt) -> InferenceResult:
 
     async def _generate_chat(self, prompt: Prompt) -> InferenceResult:
         """Use ``/v1/chat/completions`` with proper message formatting."""
-        payload = {
+        payload: dict[str, object] = {
             "model": self._model_id,
             "messages": [{"role": "user", "content": prompt.text}],
             "max_tokens": prompt.max_tokens,
             "temperature": prompt.metadata.get("temperature", 0.0) if prompt.metadata else 0.0,
+            "logprobs": True,
+            "top_logprobs": 5,
         }
 
         start = time.perf_counter()
@@ -103,7 +105,46 @@ async def _generate_chat(self, prompt: Prompt) -> InferenceResult:
         text: str = message.get("content", "")
         if not text:
             text = message.get("reasoning_content", "")
-        tokens = text.split()
+
+        # Parse logprobs (chat completions format) -------------------------
+        tokens: list[str] = []
+        logprobs_list: list[float] | None = None
+        distributions: list[list[float]] | None = None
+        distribution_metadata: list[dict[str, int | str]] | None = None
+
+        lp_data = choice.get("logprobs")
+        if lp_data and lp_data.get("content"):
+            content_logprobs = lp_data["content"]
+            tokens = [entry["token"] for entry in content_logprobs]
+            logprobs_list = [
+                float(entry["logprob"]) if entry.get("logprob") is not None else -9999.0 for entry in content_logprobs
+            ]
+
+            distributions = []
+            distribution_metadata = []
+            for entry in content_logprobs:
+                top = entry.get("top_logprobs", [])
+                if not top:
+                    distributions.append([])
+                    distribution_metadata.append({})
+                    continue
+                sorted_items = sorted(top, key=lambda x: x.get("token", ""))
+                cleaned: list[tuple[str, float]] = []
+                for item in sorted_items:
+                    try:
+                        fv = float(item["logprob"]) if item.get("logprob") is not None else -9999.0
+                    except (TypeError, ValueError):
+                        fv = -9999.0
+                    if math.isnan(fv):
+                        fv = -9999.0
+                    cleaned.append((item.get("token", ""), fv))
+                distributions.append([fv for _, fv in cleaned])
+                meta: dict[str, int | str] = {}
+                for i, (tok, _) in enumerate(cleaned):
+                    meta[f"id_{i}"] = tok
+                distribution_metadata.append(meta)
+        else:
+            tokens = text.split()
 
         usage = data.get("usage", {})
         completion_tokens = usage.get("completion_tokens", len(tokens))
@@ -114,7 +155,9 @@ async def _generate_chat(self, prompt: Prompt) -> InferenceResult:
             backend_name=self.name,
             model_id=self._model_id,
             tokens=tokens,
-            logprobs=None,
+            logprobs=logprobs_list,
+            distributions=distributions,
+            distribution_metadata=distribution_metadata,
             text=text,
             latency_ms=elapsed_s * 1000,
             tokens_per_second=tps,
diff --git a/src/infer_check/cli.py b/src/infer_check/cli.py
@@ -59,11 +59,7 @@ def _load_prompts(ctx: click.Context, prompts: str, max_tokens: int | None, num_
     if num_prompts is not None:
         ctx.obj["num_prompts"] = num_prompts
 
-    prompt_list = load_suite(_resolve_prompts(prompts))
-
-    # Apply num_prompts limit
-    if ctx.obj["num_prompts"] is not None:
-        prompt_list = prompt_list[: ctx.obj["num_prompts"]]
+    prompt_list = load_suite(_resolve_prompts(prompts), num_prompts=ctx.obj["num_prompts"])
 
     # Apply global max_tokens only if not explicitly set in the prompt JSONL
     for p in prompt_list:
diff --git a/src/infer_check/suites/loader.py b/src/infer_check/suites/loader.py
@@ -12,18 +12,19 @@
 console = Console()
 
 
-def load_suite(path: str | Path) -> list[Prompt]:
+def load_suite(path: str | Path, num_prompts: int | None = None) -> list[Prompt]:
     """
     Read a JSONL file and validate each line against the Prompt model.
     Logs the count and category distribution via rich.console.
-    Raises ValueError with the line number on invalid entries.
+    If num_prompts is provided, selects an approximately equal number
+    of prompts from each category.
     """
     path_obj = Path(path)
     if not path_obj.exists():
         raise FileNotFoundError(f"Prompt suite not found: {path_obj}")
 
-    prompts = []
-    category_counts: Counter[str] = Counter()
+    all_prompts: list[Prompt] = []
+    prompts_by_category: dict[str, list[Prompt]] = {}
 
     with path_obj.open("r", encoding="utf-8") as f:
         for idx, line in enumerate(f, start=1):
@@ -34,19 +35,52 @@ def load_suite(path: str | Path) -> list[Prompt]:
             try:
                 data = json.loads(line)
                 prompt = Prompt.model_validate(data)
-                prompts.append(prompt)
-                category_counts[prompt.category] += 1
+                all_prompts.append(prompt)
+                cat = prompt.category or "default"
+                if cat not in prompts_by_category:
+                    prompts_by_category[cat] = []
+                prompts_by_category[cat].append(prompt)
             except json.JSONDecodeError as e:
                 raise ValueError(f"Invalid JSON at {path_obj}:{idx} - {e}") from e
             except ValidationError as e:
                 raise ValueError(f"Invalid Prompt at {path_obj}:{idx} - {e}") from e
 
+    # Apply num_prompts limit with equal category distribution
+    if num_prompts is not None and num_prompts < len(all_prompts):
+        selected_prompts: list[Prompt] = []
+        categories = sorted(prompts_by_category.keys())
+        num_categories = len(categories)
+
+        if num_categories > 0:
+            # Simple round-robin selection to keep categories equal
+            # We iterate through categories and pick one prompt from each until we hit the limit
+            # This ensures that even if categories have different sizes, we pick as equally as possible
+            cat_indices = {cat: 0 for cat in categories}
+            while len(selected_prompts) < num_prompts:
+                added_in_round = False
+                for cat in categories:
+                    if len(selected_prompts) >= num_prompts:
+                        break
+                    idx = cat_indices[cat]
+                    if idx < len(prompts_by_category[cat]):
+                        selected_prompts.append(prompts_by_category[cat][idx])
+                        cat_indices[cat] += 1
+                        added_in_round = True
+                if not added_in_round:
+                    break
+            final_prompts = selected_prompts
+        else:
+            final_prompts = all_prompts[:num_prompts]
+    else:
+        final_prompts = all_prompts
+
     # Log summary
-    console.print(f"[bold green]Loaded {len(prompts)} prompts from {path_obj.name}[/bold green]")
+    category_counts = Counter(p.category or "default" for p in final_prompts)
+    console.print(f"[bold green]Loaded {len(final_prompts)} prompts from {path_obj.name}[/bold green]")
     for category, count in category_counts.most_common():
         console.print(f"  - {category}: {count}")
 
-    return prompts
+    return final_prompts
 
 
 def save_suite(prompts: list[Prompt], path: str | Path) -> None:
diff --git a/tests/unit/test_loader_distribution.py b/tests/unit/test_loader_distribution.py
@@ -0,0 +1,86 @@
+import json
+from pathlib import Path
+
+from infer_check.suites.loader import load_suite
+
+
+def test_load_suite_equal_distribution(tmp_path: Path) -> None:
+    """Test that load_suite distributes num_prompts equally across categories."""
+    prompt_file = tmp_path / "test_prompts.jsonl"
+
+    # 10 math, 5 code, 2 logic
+    prompts = []
+    for i in range(10):
+        prompts.append({"id": f"math-{i}", "text": f"math {i}", "category": "math"})
+    for i in range(5):
+        prompts.append({"id": f"code-{i}", "text": f"code {i}", "category": "code"})
+    for i in range(2):
+        prompts.append({"id": f"logic-{i}", "text": f"logic {i}", "category": "logic"})
+
+    prompt_file.write_text("\n".join(json.dumps(p) for p in prompts))
+
+    # Request 6 prompts.
+    # load_suite visits categories in sorted order (code, logic, math),
+    # not file order, and takes one prompt per category on each pass:
+    # Round 1: code-0, logic-0, math-0 (3 total)
+    # Round 2: code-1, logic-1, math-1 (6 total)
+    # so each category contributes exactly 2 prompts.
+    loaded = load_suite(prompt_file, num_prompts=6)
+
+    assert len(loaded) == 6
+    categories = [p.category for p in loaded]
+    from collections import Counter  # local import: Counter is only used by this test
+
+    counts = Counter(categories)
+
+    assert counts["math"] == 2
+    assert counts["code"] == 2
+    assert counts["logic"] == 2
+
+    # Request 4 prompts (same sorted category order: code, logic, math).
+    # Round 1: code-0, logic-0, math-0 (3 total)
+    # Round 2: code-1 (4 total; limit reached before logic and math get a second pick)
+    loaded_4 = load_suite(prompt_file, num_prompts=4)
+    assert len(loaded_4) == 4
+    counts_4 = Counter([p.category for p in loaded_4])
+    assert counts_4["code"] == 2
+    assert counts_4["logic"] == 1
+    assert counts_4["math"] == 1
+
+
+def test_load_suite_uneven_categories(tmp_path: Path) -> None:
+    """Test distribution when some categories are exhausted."""
+    prompt_file = tmp_path / "test_prompts_uneven.jsonl"
+
+    # 5 math, 1 code
+    prompts = []
+    for i in range(5):
+        prompts.append({"id": f"math-{i}", "text": f"math {i}", "category": "math"})
+    prompts.append({"id": "code-0", "text": "code 0", "category": "code"})
+
+    prompt_file.write_text("\n".join(json.dumps(p) for p in prompts))
+
+    # Request 4 prompts; categories are visited in sorted order: code, math.
+    # Once code runs out, later rounds draw from math alone.
+    # Round 1: code-0, math-0 (2 total)
+    # Round 2: (code exhausted), math-1 (3 total)
+    # Round 3: (code exhausted), math-2 (4 total)
+    loaded = load_suite(prompt_file, num_prompts=4)
+
+    assert len(loaded) == 4
+    counts = {p.category: 0 for p in loaded}
+    for p in loaded:
+        counts[p.category] += 1
+
+    assert counts["code"] == 1
+    assert counts["math"] == 3
+
+
+def test_load_suite_no_limit(tmp_path: Path) -> None:
+    """Test that load_suite returns all prompts if no limit is provided."""
+    prompt_file = tmp_path / "test_prompts_all.jsonl"
+    prompts = [{"id": "1", "text": "t1", "category": "a"}, {"id": "2", "text": "t2", "category": "b"}]
+    prompt_file.write_text("\n".join(json.dumps(p) for p in prompts))
+
+    loaded = load_suite(prompt_file)  # num_prompts defaults to None, so the category-balancing branch is skipped
+    assert len(loaded) == 2