From 9bdb143505a6985554d14cd40caf5cafbba41387 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Mon, 1 Jun 2026 08:40:10 -0700
Subject: [PATCH 1/2] Harden OpenAIProvider readiness probe

Add a ``readiness_probe`` constructor kwarg accepting "models",
"chat_completions", or "both", and flip the default from the older
catalog-only ``GET /v1/models`` probe to a new ``POST /v1/chat/
completions`` probe with ``max_tokens=1``.

Motivation: OpenAI-compatible proxies can return 200 on the catalog
endpoint while rejecting completions (Bifrost is the field-reported
case), so the previous probe reported ready while every real call
failed. The new default actually exercises the inference wire path,
so that failure class surfaces at preflight. Non-200 chat-probe
responses route through ``classify_http_error`` so canonical error
categories surface consistently. Catalog-only behavior remains opt-in
for cost-sensitive cloud callers.

Conformance harness picks ``readiness_probe`` mode from the fixture's
mocked ``health_endpoint.path`` so fixture 007's catalog semantics
keep working without spec changes.
---
 CHANGELOG.md                             |   6 +
 docs/agent/non-obvious-shapes.md         |   4 +
 docs/concepts/llms.md                    |  64 +++++
 src/openarmature/AGENTS.md               |   4 +
 src/openarmature/llm/providers/openai.py |  95 ++++++--
 tests/conformance/test_llm_provider.py   |  22 +-
 tests/unit/test_llm_provider.py          | 283 +++++++++++++++++++++++
 7 files changed, 459 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 37bc5ad..6ecfbcd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to `openarmature-python` are documented in this file.
 
 The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The package follows [Semantic Versioning](https://semver.org/); pre-1.0 minor bumps may carry behavioral changes per [spec governance](https://github.com/LunarCommand/openarmature-spec/blob/main/GOVERNANCE.md).
 
+## [Unreleased]
+
+### Changed (breaking)
+
+- **`OpenAIProvider.ready()` default probe flipped to `chat_completions`.** A new constructor kwarg `readiness_probe: Literal["models", "chat_completions", "both"]` selects which wire path `ready()` exercises; the default is now the chat-completions path (`POST /v1/chat/completions` with `max_tokens=1`), which actually exercises the inference path. The previous catalog-only behavior is still available as `readiness_probe="models"`, and `readiness_probe="both"` runs catalog then chat for the strongest signal. Motivation: OpenAI-compatible proxies (Bifrost and similar) can return 200 on `GET /v1/models` while rejecting `POST /v1/chat/completions`, leaving the catalog probe green while every real call fails. The new default surfaces that class of failure at preflight rather than at first inference. Non-200 chat-probe responses route through `classify_http_error`, so the canonical error categories (`provider_authentication`, `provider_unavailable`, `provider_invalid_model`, etc.) surface consistently. Callers that depended on the catalog-only behavior (cost-sensitive cloud setups where every `ready()` would now bill prompt tokens) can opt back in by passing `readiness_probe="models"`.
+
 ## [0.11.0] — 2026-06-01
 
 Observability + prompt-management release. The pinned spec advances from v0.27.1 to v0.38.0, absorbing eight accepted proposals (0039-0046). Two headlines: (1) the Langfuse observer grows native `trace.input` / `trace.output` sourcing with caller hooks (0043) and the per-async-context augmentation boundary becomes lineage-aware for nested fan-out / parallel-branches topologies (0045); (2) prompt-management gains a Chat-prompt variant alongside the existing Text-prompt (0046) and `LangfusePromptBackend` lands for both Langfuse text and chat prompts. Caller-supplied `invocation_id` (0039), mid-invocation open-span metadata update (0040), three reserved-key surfaces (0041 + 0042), and the parallel-branches OTel dispatch span (0044) round out the cycle.
diff --git a/docs/agent/non-obvious-shapes.md b/docs/agent/non-obvious-shapes.md
index 9bc1a1c..f7a1496 100644
--- a/docs/agent/non-obvious-shapes.md
+++ b/docs/agent/non-obvious-shapes.md
@@ -80,6 +80,10 @@ A common shape is "after this LLM call, route to either a JSON-extraction node o
 
 When the branches operate on different sub-shapes of state — e.g., one path is "extract JSON, then validate" while another is "dispatch tools, loop until done, then summarize" — encapsulate each as a `SubgraphNode` and route from the LLM node to the right subgraph. Each subgraph has its own state schema (projected from the parent), its own entry node, and its own internal topology. The parent graph becomes a switchboard with a few edges; the complexity lives one layer down where it composes cleanly.
 
+### `OpenAIProvider.ready()` exercises `chat/completions` by default; opt back into the catalog-only probe for cost-sensitive callers
+
+`OpenAIProvider(..., readiness_probe=...)` accepts `"chat_completions"` (default), `"models"`, or `"both"`. The default issues `POST /v1/chat/completions` with a `max_tokens=1` body so a green `ready()` actually proves the inference wire path works, not just that the catalog endpoint answers. The motivating failure class: OpenAI-compatible proxies (Bifrost is the field-reported case) that return 200 on `GET /v1/models` while 405'ing the completions endpoint — the previous catalog-only default reported ready and every real call broke. The `"models"` opt-in is the old behavior, useful for cost-sensitive cloud callers where every `ready()` would otherwise bill one prompt's worth of tokens. `"both"` runs catalog then chat — strongest signal at double the cost. Non-200 responses on either probe route through `classify_http_error`, so the canonical error categories (`ProviderAuthentication`, `ProviderUnavailable`, `ProviderInvalidModel`, etc.) surface consistently regardless of which probe ran.
+
 ### Be explicit with `tool_choice`; don't trust the provider's default
 
 `Provider.complete(messages, tools, tool_choice=...)` accepts `"auto"`, `"required"`, `"none"`, or a `ForceTool(name=...)` record. When you omit `tool_choice`, the OpenAI provider's own default applies — usually `"auto"` when `tools` is non-empty, but documented per-provider. A pipeline that wants deterministic tool-calling (a routing node that MUST produce a tool call, a guarded LLM call that MUST NOT call tools) should pin `tool_choice` explicitly rather than relying on the provider default.
diff --git a/docs/concepts/llms.md b/docs/concepts/llms.md
index 91f398f..cdcd4c7 100644
--- a/docs/concepts/llms.md
+++ b/docs/concepts/llms.md
@@ -85,6 +85,70 @@ stateless calls. Conversational memory (if you want it) is the
 caller's responsibility: thread it through state and pass the
 accumulated message list into each call.
 
+## Pre-flight readiness check
+
+`Provider.ready()` is the optional pre-flight call you make before
+your application starts taking real traffic. It raises one of the
+canonical [`LlmProviderError`](../reference/llm.md) categories on
+failure and returns `None` on success, so a typical startup hook
+looks like:
+
+```python
+async def startup() -> None:
+    provider = _get_provider()
+    try:
+        await provider.ready()
+    except ProviderAuthentication:
+        # Bad API key — fail fast at boot.
+        raise
+    except ProviderInvalidModel:
+        # Bound model isn't served by this endpoint — same.
+        raise
+    except ProviderUnavailable:
+        # Endpoint is down or unreachable — fail fast too.
+        raise
+```
+
+`OpenAIProvider` ships three probe shapes selected via the
+`readiness_probe` constructor kwarg:
+
+- **`"chat_completions"`** (default) — issues `POST /v1/chat/completions`
+  with a `max_tokens=1` body. Actually exercises the inference wire
+  path. Strongest signal at the cost of one prompt's worth of tokens
+  on cloud endpoints.
+- **`"models"`** — issues `GET /v1/models` and verifies the bound
+  model appears in the catalog. Cheaper (no completion billing) but
+  blind to proxy wire-mismatch cases: some OpenAI-compatible proxies
+  (Bifrost is the motivating example) serve `/v1/models` correctly
+  while 405'ing the completions endpoint, so a green catalog probe
+  doesn't prove `complete()` will work.
+- **`"both"`** — runs the catalog probe first (cheap fail-fast on
+  model-not-in-catalog with the cleaner `seen_ids` diagnostic), then
+  the chat probe. Strongest signal at double the round-trip cost.
+
+```python
+# Local server (LM Studio, vLLM, llama.cpp) — chat probe is free.
+provider = OpenAIProvider(
+    base_url="http://localhost:8000",
+    model="qwen2.5-coder",
+    readiness_probe="chat_completions",  # default
+)
+
+# Cloud endpoint, cost-sensitive — opt back into the catalog-only probe.
+provider = OpenAIProvider(
+    base_url="https://api.openai.com",
+    model="gpt-4o-mini",
+    api_key=os.environ["LLM_API_KEY"],
+    readiness_probe="models",
+)
+```
+
+The chat probe is the default because the catalog probe's
+false-green failure mode (Bifrost-style proxy mismatch) is silent at
+boot but fatal at first real call, and that's worse than the extra
+token spend for the small set of cost-sensitive callers who can opt
+out explicitly.
+
 ## Structured output
 
 Every LLM-using node that produces typed data ends up with the same
diff --git a/src/openarmature/AGENTS.md b/src/openarmature/AGENTS.md
index 9b1afa6..21e6985 100644
--- a/src/openarmature/AGENTS.md
+++ b/src/openarmature/AGENTS.md
@@ -943,6 +943,10 @@ A common shape is "after this LLM call, route to either a JSON-extraction node o
 
 When the branches operate on different sub-shapes of state — e.g., one path is "extract JSON, then validate" while another is "dispatch tools, loop until done, then summarize" — encapsulate each as a `SubgraphNode` and route from the LLM node to the right subgraph. Each subgraph has its own state schema (projected from the parent), its own entry node, and its own internal topology. The parent graph becomes a switchboard with a few edges; the complexity lives one layer down where it composes cleanly.
 
+### `OpenAIProvider.ready()` exercises `chat/completions` by default; opt back into the catalog-only probe for cost-sensitive callers
+
+`OpenAIProvider(..., readiness_probe=...)` accepts `"chat_completions"` (default), `"models"`, or `"both"`. The default issues `POST /v1/chat/completions` with a `max_tokens=1` body so a green `ready()` actually proves the inference wire path works, not just that the catalog endpoint answers. The motivating failure class: OpenAI-compatible proxies (Bifrost is the field-reported case) that return 200 on `GET /v1/models` while 405'ing the completions endpoint — the previous catalog-only default reported ready and every real call broke. The `"models"` opt-in is the old behavior, useful for cost-sensitive cloud callers where every `ready()` would otherwise bill one prompt's worth of tokens. `"both"` runs catalog then chat — strongest signal at double the cost. Non-200 responses on either probe route through `classify_http_error`, so the canonical error categories (`ProviderAuthentication`, `ProviderUnavailable`, `ProviderInvalidModel`, etc.) surface consistently regardless of which probe ran.
+
 ### Be explicit with `tool_choice`; don't trust the provider's default
 
 `Provider.complete(messages, tools, tool_choice=...)` accepts `"auto"`, `"required"`, `"none"`, or a `ForceTool(name=...)` record. When you omit `tool_choice`, the OpenAI provider's own default applies — usually `"auto"` when `tools` is non-empty, but documented per-provider. A pipeline that wants deterministic tool-calling (a routing node that MUST produce a tool call, a guarded LLM call that MUST NOT call tools) should pin `tool_choice` explicitly rather than relying on the provider default.
diff --git a/src/openarmature/llm/providers/openai.py b/src/openarmature/llm/providers/openai.py
index 73fa25e..34c311c 100644
--- a/src/openarmature/llm/providers/openai.py
+++ b/src/openarmature/llm/providers/openai.py
@@ -22,20 +22,30 @@
 | HTTP 5xx (other)                                  | provider_unavailable         |
 | 200 OK that fails to parse into Response shape    | provider_invalid_response    |
 
-**``ready()`` probe.** Hits ``GET /v1/models`` and:
-
-- 401/403 → ``provider_authentication``.
-- 5xx / connection error → ``provider_unavailable``.
-- 200 + bound model in returned list → success.
-- 200 + bound model NOT in list → ``provider_invalid_model``.
-
-The ``provider_model_not_loaded`` distinction needs a server-specific
-probe (LM Studio's loaded-vs-configured endpoint, vLLM's health
-endpoint, llama.cpp's runtime-status endpoint) that this base
-provider can't generically emit. Subclasses or purpose-built
-local-server provider variants close that gap; the base
-``OpenAIProvider`` documents the limitation here rather than silently
-treating "model in catalog" as "model loaded."
+**``ready()`` probe.** Three modes selected by the constructor kwarg
+``readiness_probe``:
+
+- ``"chat_completions"`` (default) — issues ``POST /v1/chat/completions``
+  with a minimal ``max_tokens=1`` body. Actually exercises the inference
+  path, so OpenAI-compatible proxies (Bifrost, custom gateways) that
+  return 200 on ``GET /v1/models`` but reject ``POST /v1/chat/completions``
+  surface immediately rather than at first real call.
+- ``"models"`` — hits ``GET /v1/models`` and verifies the bound model
+  appears in the returned catalog. Cheaper (no completion tokens
+  billed) but doesn't catch the wire-mismatch case above.
+- ``"both"`` — runs the catalog probe first, then the chat probe.
+  Catalog check short-circuits with the cleaner "model not in catalog"
+  diagnostic before any billable call.
+
+All three modes map non-200 responses through ``classify_http_error``
+so the canonical error categories (``provider_authentication``,
+``provider_unavailable``, ``provider_invalid_model``,
+``provider_model_not_loaded``, ``provider_rate_limit``) surface
+consistently regardless of which probe ran.
+
+The previous default was ``"models"``; flipped to ``"chat_completions"``
+because the catalog probe missed a real failure class (proxy wire-
+format mismatch) in field use.
 """
 
 from __future__ import annotations
@@ -139,6 +149,7 @@ def __init__(
         timeout: float = 60.0,
         force_prompt_augmentation_fallback: bool = False,
         genai_system: str = "openai",
+        readiness_probe: Literal["models", "chat_completions", "both"] = "chat_completions",
     ) -> None:
         self.base_url = _validate_and_normalize_base_url(base_url)
         self.model = model
@@ -157,6 +168,16 @@ def __init__(
         # those servers, and a wrong inference is worse than the explicit
         # opt-in.
         self._genai_system = genai_system
+        # ``readiness_probe`` selects which wire path ``ready()`` exercises.
+        # The default ``"chat_completions"`` actually tests inference; the
+        # opt-in ``"models"`` is the older catalog-only probe for
+        # cost-sensitive cloud callers (every chat probe bills prompt
+        # tokens). ``"both"`` runs catalog then chat for the strongest
+        # signal at double the round-trip cost. Same explicit-opt-in
+        # rationale as ``genai_system``: no base_url sniffing, since the
+        # right probe shape depends on what's on the other end and a
+        # wrong inference is worse than a wrong default.
+        self._readiness_probe = readiness_probe
         self._headers: dict[str, str] = {"Content-Type": "application/json"}
         if api_key is not None:
             self._headers["Authorization"] = f"Bearer {api_key}"
@@ -188,9 +209,29 @@ async def aclose(self) -> None:
     # ------------------------------------------------------------------
 
     async def ready(self) -> None:
-        """Verify the bound model is reachable and listed by the
-        provider. Hits ``GET /v1/models`` and matches ``self.model``
-        against the returned ``data[].id`` entries."""
+        """Verify the bound model is reachable. Dispatches on the
+        ``readiness_probe`` mode chosen at construction:
+
+        - ``"chat_completions"`` (default) issues a ``max_tokens=1``
+          chat call against ``POST /v1/chat/completions``.
+        - ``"models"`` issues ``GET /v1/models`` and matches
+          ``self.model`` against the returned ``data[].id`` entries.
+        - ``"both"`` runs the catalog probe first (cheaper, surfaces
+          model-not-in-catalog with the catalog diagnostic), then the
+          chat probe.
+        """
+        if self._readiness_probe in ("models", "both"):
+            await self._probe_models()
+        if self._readiness_probe in ("chat_completions", "both"):
+            await self._probe_chat_completions()
+
+    async def _probe_models(self) -> None:
+        """Catalog probe — ``GET /v1/models`` + bound-model presence
+        check. Cheaper than the chat probe (no completion tokens
+        billed) and surfaces the model-not-in-catalog case with the
+        cleaner ``seen_ids`` diagnostic; misses wire-format mismatches
+        on proxies that serve the catalog correctly but reject
+        completions."""
         try:
             resp = await self._client.get("/v1/models")
         except httpx.HTTPError as exc:
@@ -246,6 +287,26 @@ async def ready(self) -> None:
                     f"model {self.model!r} is configured but not loaded (status={status_field!r})"
                 )
 
+    async def _probe_chat_completions(self) -> None:
+        """Inference probe — ``POST /v1/chat/completions`` with a
+        ``max_tokens=1`` body. Surfaces wire-format mismatches that
+        the catalog probe can't see (the motivating case: Bifrost-
+        style proxies that 200 on ``/v1/models`` but 405/404 on
+        ``/v1/chat/completions``). Bills one prompt's worth of tokens
+        on cloud endpoints, which is why this defaults on but is
+        opt-out via ``readiness_probe="models"``."""
+        body = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": "."}],
+            "max_tokens": 1,
+        }
+        try:
+            resp = await self._client.post("/v1/chat/completions", json=body)
+        except httpx.HTTPError as exc:
+            raise ProviderUnavailable(str(exc)) from exc
+        if resp.status_code != 200:
+            raise classify_http_error(resp)
+
     # ------------------------------------------------------------------
     # complete() — single completion call
     # ------------------------------------------------------------------
diff --git a/tests/conformance/test_llm_provider.py b/tests/conformance/test_llm_provider.py
index 59ad4cc..6c4279d 100644
--- a/tests/conformance/test_llm_provider.py
+++ b/tests/conformance/test_llm_provider.py
@@ -21,7 +21,7 @@
 import json
 from collections.abc import Awaitable, Callable, Iterator, Mapping
 from pathlib import Path
-from typing import Any, cast
+from typing import Any, Literal, cast
 
 import httpx
 import pytest
@@ -156,8 +156,25 @@ def _build_provider(
 ) -> tuple[OpenAIProvider, list[httpx.Request]]:
     # Some fixtures (007 ready-check) use ``health_endpoint`` instead
     # of ``responses`` — that's the same shape, just one entry.
+    readiness_probe: Literal["models", "chat_completions", "both"] = "chat_completions"
     if "health_endpoint" in mock_provider_cfg:
-        responses: list[Mapping[str, Any]] = [cast("Mapping[str, Any]", mock_provider_cfg["health_endpoint"])]
+        health_endpoint = cast("Mapping[str, Any]", mock_provider_cfg["health_endpoint"])
+        responses: list[Mapping[str, Any]] = [health_endpoint]
+        # The spec fixture intentionally leaves the probe shape to the
+        # implementation (007 comment: "the implementation's chosen probe").
+        # Pick the OpenAIProvider readiness_probe mode that matches the
+        # fixture's mocked path so the mock is actually exercised. Fixtures
+        # 007's cases all mock ``/v1/models``, so the catalog probe is what
+        # they verify; a future fixture mocking ``/v1/chat/completions``
+        # would automatically route to the chat probe. A missing ``path``
+        # field leaves us at the OpenAIProvider default; all current
+        # fixtures populate ``path``, so this branch is unreachable in
+        # practice and the fallthrough is only defensive.
+        endpoint_path = health_endpoint.get("path")
+        if endpoint_path == "/v1/models":
+            readiness_probe = "models"
+        elif endpoint_path == "/v1/chat/completions":
+            readiness_probe = "chat_completions"
     else:
         responses = cast(
             "list[Mapping[str, Any]]",
@@ -176,6 +193,7 @@ def _build_provider(
         api_key="test-key",
         transport=transport,
         force_prompt_augmentation_fallback=force_fallback,
+        readiness_probe=readiness_probe,
     )
     return provider, captured
 
diff --git a/tests/unit/test_llm_provider.py b/tests/unit/test_llm_provider.py
index 8aa1afd..7561843 100644
--- a/tests/unit/test_llm_provider.py
+++ b/tests/unit/test_llm_provider.py
@@ -573,3 +573,286 @@ def test_runtime_config_from_partial_empty() -> None:
     assert config.temperature is None
     assert config.frequency_penalty is None
     assert config.stop_sequences is None
+
+
+# ---------------------------------------------------------------------------
+# ready() readiness_probe modes
+# ---------------------------------------------------------------------------
+# Verifies the v0.12.0 default-flip: chat_completions is the strict probe
+# (actually exercises inference), models is the opt-in catalog-only probe,
+# both runs catalog then chat. Conformance fixture 007 owns the catalog-only
+# semantics — these tests cover the new wire paths and the dispatch.
+
+
+async def test_ready_chat_completions_200_passes() -> None:
+    def _ok(req: httpx.Request) -> httpx.Response:
+        assert req.url.path == "/v1/chat/completions"
+        return httpx.Response(
+            200,
+            json={
+                "id": "x",
+                "object": "chat.completion",
+                "created": 0,
+                "model": "m",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {"role": "assistant", "content": "."},
+                        "finish_reason": "length",
+                    }
+                ],
+                "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+            },
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_ok),
+    )
+    try:
+        await provider.ready()
+    finally:
+        await provider.aclose()
+
+
+async def test_ready_chat_completions_405_surfaces_unavailable() -> None:
+    # The Bifrost-style proxy case: the catalog endpoint may answer 200
+    # (which the older default probe accepted), but POST /v1/chat/completions
+    # returns 405 because the proxy doesn't actually serve completions.
+    # classify_http_error routes 405 through ProviderUnavailable.
+    def _405(req: httpx.Request) -> httpx.Response:
+        assert req.url.path == "/v1/chat/completions"
+        return httpx.Response(405, json={"error": {"message": "method not allowed"}})
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_405),
+    )
+    try:
+        with pytest.raises(ProviderUnavailable):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+
+
+async def test_ready_chat_completions_401_surfaces_authentication() -> None:
+    def _401(_req: httpx.Request) -> httpx.Response:
+        return httpx.Response(401, json={"error": {"message": "Invalid API key"}})
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_401),
+    )
+    try:
+        with pytest.raises(ProviderAuthentication):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+
+
+async def test_ready_chat_completions_404_model_not_found_surfaces_invalid_model() -> None:
+    def _404(_req: httpx.Request) -> httpx.Response:
+        return httpx.Response(
+            404,
+            json={"error": {"code": "model_not_found", "message": "no such model 'm'"}},
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_404),
+    )
+    try:
+        with pytest.raises(ProviderInvalidModel):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+
+
+async def test_ready_both_runs_catalog_then_chat() -> None:
+    # The both-mode contract: catalog probe first (so catalog-only failures
+    # short-circuit before the billable chat call), then chat probe. This
+    # test verifies both endpoints get hit in order on the happy path.
+    seen: list[str] = []
+
+    def _handler(req: httpx.Request) -> httpx.Response:
+        seen.append(req.url.path)
+        if req.url.path == "/v1/models":
+            return httpx.Response(
+                200,
+                json={"object": "list", "data": [{"id": "m", "object": "model"}]},
+            )
+        return httpx.Response(
+            200,
+            json={
+                "id": "x",
+                "object": "chat.completion",
+                "created": 0,
+                "model": "m",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {"role": "assistant", "content": "."},
+                        "finish_reason": "length",
+                    }
+                ],
+                "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+            },
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_handler),
+        readiness_probe="both",
+    )
+    try:
+        await provider.ready()
+    finally:
+        await provider.aclose()
+    assert seen == ["/v1/models", "/v1/chat/completions"]
+
+
+async def test_ready_both_catalog_missing_model_short_circuits() -> None:
+    # When catalog returns 200 but the bound model isn't in the list, the
+    # catalog probe MUST raise ProviderInvalidModel before any chat call.
+    # The single ``seen``-tracking handler answers any path with the catalog
+    # JSON, so a leaked chat call would silently 200; the real proof of
+    # short-circuit is the seen == ["/v1/models"] assertion at the bottom.
+    seen: list[str] = []
+
+    def _handler(req: httpx.Request) -> httpx.Response:
+        seen.append(req.url.path)
+        return httpx.Response(
+            200,
+            json={
+                "object": "list",
+                "data": [{"id": "other-model", "object": "model"}],
+            },
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_handler),
+        readiness_probe="both",
+    )
+    try:
+        with pytest.raises(ProviderInvalidModel):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+    assert seen == ["/v1/models"]
+
+
+async def test_ready_models_mode_still_hits_catalog() -> None:
+    # Explicit opt-in to the old default. Verifies the dispatch routes
+    # models-mode through the catalog endpoint and ignores the chat path.
+    seen: list[str] = []
+
+    def _handler(req: httpx.Request) -> httpx.Response:
+        seen.append(req.url.path)
+        return httpx.Response(
+            200,
+            json={"object": "list", "data": [{"id": "m", "object": "model"}]},
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_handler),
+        readiness_probe="models",
+    )
+    try:
+        await provider.ready()
+    finally:
+        await provider.aclose()
+    assert seen == ["/v1/models"]
+
+
+async def test_ready_both_catalog_200_chat_405_surfaces_unavailable() -> None:
+    # The actual Bifrost case via ``both`` mode: catalog probe sees 200 from
+    # ``/v1/models`` with the bound model present, then the chat probe gets
+    # 405. classify_http_error routes 405 to ProviderUnavailable. This is
+    # the failure shape ``both`` mode is meant to surface that the catalog-
+    # only probe missed.
+    seen: list[str] = []
+
+    def _handler(req: httpx.Request) -> httpx.Response:
+        seen.append(req.url.path)
+        if req.url.path == "/v1/models":
+            return httpx.Response(
+                200,
+                json={"object": "list", "data": [{"id": "m", "object": "model"}]},
+            )
+        return httpx.Response(405, json={"error": {"message": "method not allowed"}})
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_handler),
+        readiness_probe="both",
+    )
+    try:
+        with pytest.raises(ProviderUnavailable):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+    assert seen == ["/v1/models", "/v1/chat/completions"]
+
+
+async def test_ready_chat_completions_network_error_surfaces_unavailable() -> None:
+    # httpx network-layer failures (ConnectError, ReadTimeout, etc.) on the
+    # chat probe wrap into ProviderUnavailable, same as on the catalog
+    # probe. Fixture 007's network_failure case covers the catalog side;
+    # this covers the chat-probe side that the catalog fixture can't reach
+    # under the new default.
+    def _raises(_req: httpx.Request) -> httpx.Response:
+        raise httpx.ConnectError("connection refused")
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_raises),
+    )
+    try:
+        with pytest.raises(ProviderUnavailable, match="connection refused"):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+
+
+async def test_ready_chat_completions_503_model_not_loaded() -> None:
+    # 503 with a model-not-loaded body routes through classify_http_error
+    # to ProviderModelNotLoaded. Covered indirectly by the classifier's
+    # own tests, but a single test here pins the dispatch from the chat
+    # probe specifically.
+    def _503(_req: httpx.Request) -> httpx.Response:
+        return httpx.Response(
+            503,
+            json={"error": {"type": "model_not_loaded", "message": "model is not loaded yet"}},
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_503),
+    )
+    try:
+        with pytest.raises(ProviderModelNotLoaded):
+            await provider.ready()
+    finally:
+        await provider.aclose()

From dec61674f260966617345e15155180254da1b087 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Mon, 1 Jun 2026 09:04:29 -0700
Subject: [PATCH 2/2] Tighten readiness probe per CoPilot review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three findings on PR #109:

1. Runtime guard for ``readiness_probe``. The Literal type is a static
   hint; an unknown string would silently no-op both dispatch branches
   in ``ready()`` and report ready. Validate in ``__init__`` against a
   module-level frozenset and raise ValueError.
2. Route ``_probe_models`` non-200 responses through
   ``classify_http_error``. Previously hard-coded 401/403 to
   ProviderAuthentication and everything-else to ProviderUnavailable,
   missing ProviderRateLimit (429), ProviderModelNotLoaded (503+marker),
   and ProviderInvalidModel (404+marker). The docstring's
   mode-independence claim is now true.
3. Validate ``_probe_chat_completions`` 200 response shape. A proxy
   answering 200 with an error payload or non-OpenAI-shape JSON
   previously passed the probe. Mirror ``_do_complete``'s parse +
   ``_parse_response(payload, None, None)`` step.

Adds five new tests covering: invalid mode at construction, catalog
probe 429 → ProviderRateLimit, catalog probe 503+marker →
ProviderModelNotLoaded, chat probe 200 with error payload, and chat
probe 200 with non-JSON body.
---
 src/openarmature/llm/providers/openai.py |  28 ++++--
 tests/unit/test_llm_provider.py          | 105 +++++++++++++++++++++++
 2 files changed, 128 insertions(+), 5 deletions(-)

diff --git a/src/openarmature/llm/providers/openai.py b/src/openarmature/llm/providers/openai.py
index 34c311c..ee96a14 100644
--- a/src/openarmature/llm/providers/openai.py
+++ b/src/openarmature/llm/providers/openai.py
@@ -114,6 +114,13 @@
 )
 from ..response import FinishReason, ParsedValue, Response, RuntimeConfig, Usage
 
+# Runtime guard for ``OpenAIProvider(..., readiness_probe=...)``. The
+# Literal type narrows callers under static checkers but is not enforced
+# at runtime, so an unknown string would silently no-op both dispatch
+# branches in ``ready()`` and return None — a false-green readiness
+# signal. Validate in ``__init__`` against this set instead.
+_VALID_READINESS_PROBES = frozenset({"models", "chat_completions", "both"})
+
 
 class OpenAIProvider:
     """OpenAI Chat Completions wire-compatible provider.
@@ -177,6 +184,10 @@ def __init__(
         # rationale as ``genai_system``: no base_url sniffing, since the
         # right probe shape depends on what's on the other end and a
         # wrong inference is worse than a wrong default.
+        if readiness_probe not in _VALID_READINESS_PROBES:
+            raise ValueError(
+                f"readiness_probe must be one of {sorted(_VALID_READINESS_PROBES)} (got {readiness_probe!r})"
+            )
         self._readiness_probe = readiness_probe
         self._headers: dict[str, str] = {"Content-Type": "application/json"}
         if api_key is not None:
@@ -237,12 +248,8 @@ async def _probe_models(self) -> None:
         except httpx.HTTPError as exc:
             raise ProviderUnavailable(str(exc)) from exc
 
-        if resp.status_code in (401, 403):
-            raise ProviderAuthentication(f"GET /v1/models returned {resp.status_code}")
-        if 500 <= resp.status_code < 600:
-            raise ProviderUnavailable(f"GET /v1/models returned {resp.status_code}")
         if resp.status_code != 200:
-            raise ProviderUnavailable(f"GET /v1/models returned unexpected {resp.status_code}")
+            raise classify_http_error(resp)
 
         try:
             body_raw = resp.json()
@@ -306,6 +313,17 @@ async def _probe_chat_completions(self) -> None:
             raise ProviderUnavailable(str(exc)) from exc
         if resp.status_code != 200:
             raise classify_http_error(resp)
+        # Validate the response shape so a proxy answering 200 with an
+        # error payload or non-OpenAI-shape JSON doesn't pass the probe.
+        # Mirrors ``_do_complete``'s parse step. The returned Response
+        # is discarded — the validation itself is the point.
+        try:
+            payload_raw = resp.json()
+        except ValueError as exc:
+            raise ProviderInvalidResponse("POST /v1/chat/completions returned non-JSON body") from exc
+        if not isinstance(payload_raw, dict):
+            raise ProviderInvalidResponse("POST /v1/chat/completions returned a non-object body")
+        self._parse_response(cast("dict[str, Any]", payload_raw), None, None)
 
     # ------------------------------------------------------------------
     # complete() — single completion call
diff --git a/tests/unit/test_llm_provider.py b/tests/unit/test_llm_provider.py
index 7561843..5b3db21 100644
--- a/tests/unit/test_llm_provider.py
+++ b/tests/unit/test_llm_provider.py
@@ -575,6 +575,19 @@ def test_runtime_config_from_partial_empty() -> None:
     assert config.stop_sequences is None
 
 
+def test_readiness_probe_unknown_mode_rejected_at_construction() -> None:
+    # Literal type is a static hint, not a runtime guard. Unknown modes
+    # would otherwise silently no-op both dispatch branches in ready()
+    # and report ready, so reject at construction.
+    with pytest.raises(ValueError, match="readiness_probe must be one of"):
+        OpenAIProvider(
+            base_url="http://test",
+            model="m",
+            api_key="k",
+            readiness_probe="bogus",  # pyright: ignore[reportArgumentType]
+        )
+
+
 # ---------------------------------------------------------------------------
 # ready() readiness_probe modes
 # ---------------------------------------------------------------------------
@@ -780,6 +793,57 @@ def _handler(req: httpx.Request) -> httpx.Response:
     assert seen == ["/v1/models"]
 
 
+async def test_ready_models_429_surfaces_rate_limit() -> None:
+    # The catalog probe routes non-200 through classify_http_error, so
+    # 429 with a Retry-After lands as ProviderRateLimit carrying the
+    # parsed delay. Pre-refactor _probe_models would have flattened this
+    # to ProviderUnavailable.
+    def _429(_req: httpx.Request) -> httpx.Response:
+        return httpx.Response(
+            429,
+            headers={"Retry-After": "30"},
+            json={"error": {"message": "rate limited"}},
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_429),
+        readiness_probe="models",
+    )
+    try:
+        with pytest.raises(ProviderRateLimit) as excinfo:
+            await provider.ready()
+    finally:
+        await provider.aclose()
+    assert excinfo.value.retry_after == 30.0
+
+
+async def test_ready_models_503_model_not_loaded_surfaces_canonical_category() -> None:
+    # 503 with a model-not-loaded marker now lands as
+    # ProviderModelNotLoaded on the catalog probe too, not the previous
+    # generic ProviderUnavailable.
+    def _503(_req: httpx.Request) -> httpx.Response:
+        return httpx.Response(
+            503,
+            json={"error": {"type": "model_not_loaded", "message": "model is not loaded yet"}},
+        )
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_503),
+        readiness_probe="models",
+    )
+    try:
+        with pytest.raises(ProviderModelNotLoaded):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+
+
 async def test_ready_both_catalog_200_chat_405_surfaces_unavailable() -> None:
     # The actual Bifrost case via ``both`` mode: catalog probe sees 200 from
     # ``/v1/models`` with the bound model present, then the chat probe gets
@@ -834,6 +898,47 @@ def _raises(_req: httpx.Request) -> httpx.Response:
         await provider.aclose()
 
 
+async def test_ready_chat_completions_200_with_error_payload_surfaces_invalid_response() -> None:
+    # The residual false-green class: a proxy returning 200 with an
+    # error payload (no ``choices`` field) would pass a simple status
+    # check but indicates a deeply broken inference path. The chat probe
+    # now parses the response shape so this fails with
+    # ProviderInvalidResponse rather than reporting ready.
+    def _200_error(_req: httpx.Request) -> httpx.Response:
+        return httpx.Response(200, json={"error": "something is wrong"})
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_200_error),
+    )
+    try:
+        with pytest.raises(ProviderInvalidResponse):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+
+
+async def test_ready_chat_completions_200_with_non_json_body_surfaces_invalid_response() -> None:
+    # Same false-green class, JSON-parse leg: a proxy returning 200 with
+    # a non-JSON body (HTML error page, plain text) must not pass.
+    def _200_html(_req: httpx.Request) -> httpx.Response:
+        return httpx.Response(200, content=b"<html>error</html>", headers={"content-type": "text/html"})
+
+    provider = OpenAIProvider(
+        base_url="http://test",
+        model="m",
+        api_key="k",
+        transport=httpx.MockTransport(_200_html),
+    )
+    try:
+        with pytest.raises(ProviderInvalidResponse, match="non-JSON"):
+            await provider.ready()
+    finally:
+        await provider.aclose()
+
+
 async def test_ready_chat_completions_503_model_not_loaded() -> None:
     # 503 with a model-not-loaded body routes through classify_http_error
     # to ProviderModelNotLoaded. Covered indirectly by the classifier's