fix: improve OpenAI compatibility error handling and category assignment

NullPointerDepressiveDisorder · NullPointerDepressiveDisorder · commit e2bb1e050806 · 2026-04-14T01:53:29.000-07:00
- Introduce _ServerHTTPError (a RuntimeError subclass carrying the HTTP status code) for clearer HTTP error propagation in OpenAICompatBackend
- Refine logprobs retry logic to use status codes instead of string matching
- Adjust prompt category assignment in load_suite to keep None for uncategorized prompts instead of defaulting to "default" (grouping and the summary counts are now keyed by None)
- Safeguard token extraction in logprobs parsing by falling back to an empty string when an entry has no "token" key
diff --git a/src/infer_check/backends/openai_compat.py b/src/infer_check/backends/openai_compat.py
@@ -18,6 +18,14 @@
 __all__ = ["OpenAICompatBackend"]
 
 
+class _ServerHTTPError(RuntimeError):
+    """Internal exception carrying the HTTP status code from the server."""
+
+    def __init__(self, status_code: int, body: str) -> None:
+        self.status_code = status_code
+        super().__init__(f"Server returned HTTP {status_code}: {body}")
+
+
 class OpenAICompatBackend:
     """Backend adapter for any OpenAI-compatible completion server.
 
@@ -86,7 +94,7 @@ async def _post_chat(self, payload: dict[str, Any]) -> tuple[float, dict[str, An
         except httpx.HTTPStatusError as exc:
             status = exc.response.status_code
             body = exc.response.text[:500]
-            raise RuntimeError(f"Server returned HTTP {status}: {body}") from exc
+            raise _ServerHTTPError(status, body) from exc
 
         elapsed_s = time.perf_counter() - start
 
@@ -120,10 +128,9 @@ async def _generate_chat(self, prompt: Prompt) -> InferenceResult:
 
         try:
             elapsed_s, data = await self._post_chat(payload)
-        except RuntimeError as exc:
+        except _ServerHTTPError as exc:
             # Retry without logprobs only on 400/422 (unsupported parameter).
-            msg = str(exc)
-            if self._chat_logprobs_supported and ("HTTP 400" in msg or "HTTP 422" in msg):
+            if self._chat_logprobs_supported and exc.status_code in (400, 422):
                 self._chat_logprobs_supported = False
                 payload.pop("logprobs", None)
                 payload.pop("top_logprobs", None)
@@ -146,7 +153,7 @@ async def _generate_chat(self, prompt: Prompt) -> InferenceResult:
         lp_data = choice.get("logprobs")
         if lp_data and lp_data.get("content"):
             content_logprobs = lp_data["content"]
-            tokens = [entry["token"] for entry in content_logprobs]
+            tokens = [entry.get("token", "") for entry in content_logprobs]
             logprobs_list = []
             for entry in content_logprobs:
                 raw = entry.get("logprob")
diff --git a/src/infer_check/suites/loader.py b/src/infer_check/suites/loader.py
@@ -36,7 +36,7 @@ def load_suite(path: str | Path, num_prompts: int | None = None) -> list[Prompt]
                 data = json.loads(line)
                 prompt = Prompt.model_validate(data)
                 all_prompts.append(prompt)
-                cat = prompt.category or "default"
+                cat = prompt.category
                 if cat not in prompts_by_category:
                     prompts_by_category[cat] = []
                 prompts_by_category[cat].append(prompt)
@@ -75,7 +75,7 @@ def load_suite(path: str | Path, num_prompts: int | None = None) -> list[Prompt]
         final_prompts = all_prompts
 
     # Log summary
-    category_counts = Counter(p.category or "default" for p in final_prompts)
+    category_counts = Counter(p.category for p in final_prompts)
     console.print(f"[bold green]Loaded {len(final_prompts)} prompts from {path_obj.name}[/bold green]")
     for category, count in category_counts.most_common():
         console.print(f"  - {category}: {count}")