fix(llm): address Copilot review on PR #42

chris-colinsky · chris-colinsky · commit 03bcf23d54bc · 2026-05-15T11:07:45.000-07:00
Addresses the eight Copilot review threads on the structured-output PR:

- strict_mode_supported now requires additionalProperties to be
  EXPLICITLY false (not just missing-or-false). Missing implies the
  JSON Schema default of permitting extras, which OpenAI's strict
  mode rejects. Pydantic's .model_json_schema() omits the key by
  default, so the class-input path would have been rejected by OpenAI
  with an HTTP 400 even with conformance fixtures passing.
- _normalize_response_schema now raises ProviderInvalidRequest when
  the class form is not a BaseModel subclass, instead of letting
  AttributeError leak from model_json_schema.
- validate_response_schema now runs jsonschema.Draft202012Validator
  .check_schema() at the boundary, wrapping SchemaError as
  ProviderInvalidRequest. Malformed schemas now fail at the API
  boundary instead of escaping at decode time.
- _derive_schema_name now regex-checks the title against OpenAI's
  name constraint (^[a-zA-Z0-9_-]{1,64}$) and falls back to the
  hashed name when the title doesn't match. Sanitizing-in-place
  would silently mutate user intent; the hash is a more honest
  fallback.
- Two comments claiming Message instances are immutable Pydantic
  models were updated. The models are not configured with
  frozen=True; the safety actually comes from the helpers not
  modifying them in place.
- match_wire_body now fails on extra keys in actual. The previous
  permissive default defeated the point of expected_wire_request
  being a literal compare; partial assertions continue to live in
  the sibling expected_wire_request_checks block.
- _iter_calls now propagates expected_wire_request,
  expected_wire_request_checks, response_schema, and
  retry_middleware from sibling-of-call into the call dict. Only
  expected was being copied before. Cases-form fixtures with
  case-level wire expectations were silently running without those
  assertions.

The _iter_calls fix surfaced two pre-existing gaps in the harness's
handling of cases-shape fixtures, fixed inline:
- The harness was never wiring config from the call spec into
  provider.complete(); fixture 005's runtime_config_passthrough
  case was effectively a no-op.
- OpenAIProvider was using json.dumps default formatting for
  tool_call.function.arguments (with spaces after commas and colons),
  which doesn't match the canonical compact form OpenAI emits or the
  spec's fixture 005 expectations. Switched to compact form.

New unit tests cover the missing-additionalProperties strict-mode
case, the non-BaseModel class rejection, the malformed JSON Schema
rejection, and the invalid-title hash-fallback cases.
diff --git a/src/openarmature/llm/provider.py b/src/openarmature/llm/provider.py
@@ -38,6 +38,7 @@
 from collections.abc import Sequence
 from typing import Any, Protocol, cast
 
+import jsonschema
 from pydantic import BaseModel
 
 from .errors import ProviderInvalidRequest
@@ -184,8 +185,9 @@ def validate_response_schema(schema: object) -> None:
     """Pre-send validation for a JSON Schema passed as the
     ``response_schema`` argument to ``complete()``.
 
-    Raises :class:`ProviderInvalidRequest` if the schema is not a dict
-    or does not declare a top-level object type.
+    Raises :class:`ProviderInvalidRequest` if the schema is not a dict,
+    does not declare a top-level object type, or is not a valid JSON
+    Schema document.
     """
     if not isinstance(schema, dict):
         raise ProviderInvalidRequest(f"response_schema: MUST be a dict (got {type(schema).__name__})")
@@ -195,12 +197,23 @@ def validate_response_schema(schema: object) -> None:
         raise ProviderInvalidRequest(
             f"response_schema: top-level type MUST be 'object' (got {schema_type!r})"
         )
+    # Full JSON Schema validity check at the boundary so a malformed
+    # schema raises ProviderInvalidRequest here instead of escaping as
+    # jsonschema.SchemaError at decode time. ValidationError covers
+    # instance-against-schema failures and is handled separately on the
+    # parse path.
+    try:
+        jsonschema.Draft202012Validator.check_schema(schema_dict)
+    except jsonschema.SchemaError as exc:
+        raise ProviderInvalidRequest(f"response_schema: not a valid JSON Schema: {exc.message}") from exc
 
 
 # Strict mode (OpenAI's response_format strict:true and the analogous
 # native-decoding paths in Anthropic / Gemini) requires the schema to
 # satisfy two rules at every nested level:
-#   1. additionalProperties is NOT true (false or absent).
+#   1. additionalProperties is EXPLICITLY false. OpenAI rejects schemas
+#      where the key is absent, since absence means JSON Schema's
+#      default of permitting extras.
 #   2. every key in `properties` is listed in `required`.
 # strict_mode_supported() walks the schema tree (object properties,
 # array items, anyOf/oneOf/allOf branches, $ref targets with cycle
@@ -272,7 +285,7 @@ def _strict_mode_check(
     )
 
     if is_object_type:
-        if schema_dict.get("additionalProperties") is True:
+        if schema_dict.get("additionalProperties") is not False:
             return False
         properties = schema_dict.get("properties")
         if properties is not None and not isinstance(properties, dict):
diff --git a/src/openarmature/llm/providers/openai.py b/src/openarmature/llm/providers/openai.py
@@ -42,6 +42,7 @@
 
 import hashlib
 import json
+import re
 import uuid
 from collections.abc import Sequence
 from typing import Any, Literal, cast
@@ -237,9 +238,8 @@ async def complete(
         # On the fallback path, the wire-side messages list is an
         # augmented COPY of the caller's messages — original messages
         # MUST NOT be mutated. _augment_messages_with_schema_directive
-        # builds a fresh list; the original instances are reused
-        # (immutable Pydantic models) so the caller's sequence is
-        # untouched.
+        # builds a fresh list and does not modify the reused Message
+        # instances in place; the caller's sequence is untouched.
         wire_messages: Sequence[Message] = messages
         if schema_dict is not None and self._force_prompt_augmentation_fallback:
             wire_messages = _augment_messages_with_schema_directive(messages, schema_dict)
@@ -461,24 +461,38 @@ def _normalize_response_schema(
     if response_schema is None:
         return None, None
     if isinstance(response_schema, type):
-        # Per the Protocol signature, the only class form accepted is
-        # a BaseModel subclass; non-BaseModel classes will AttributeError
-        # on model_json_schema below.
+        # Defensive runtime check: the Protocol signature accepts
+        # type[BaseModel], but Python doesn't enforce that at the call
+        # boundary. Reject non-BaseModel classes with a canonical error
+        # instead of letting AttributeError leak from model_json_schema.
+        if not issubclass(response_schema, BaseModel):  # pyright: ignore[reportUnnecessaryIsInstance]
+            raise ProviderInvalidRequest(
+                f"response_schema: class form MUST be a Pydantic BaseModel subclass "
+                f"(got {response_schema.__name__})"
+            )
         schema_dict = response_schema.model_json_schema()
         validate_response_schema(schema_dict)
         return schema_dict, response_schema
     validate_response_schema(response_schema)
     return response_schema, None
 
 
+# OpenAI's response_format.json_schema.name field is restricted to
+# letters, digits, underscores, and dashes with a max length of 64
+# characters. A JSON Schema title can be any string ("Person Record",
+# "User's Profile", etc.), so verbatim use risks a 400 on the wire.
+_OPENAI_SCHEMA_NAME_RE = re.compile(r"^[a-zA-Z0-9_-]{1,64}$")
+
+
 # Derive a stable identifier for the JSON Schema for OpenAI's
 # response_format.json_schema.name field. Uses the schema's `title`
-# when present (and a valid identifier-shaped string); otherwise
-# derives a deterministic short hash so the same schema always
-# produces the same name across calls.
+# when it satisfies the provider's name constraints; otherwise derives
+# a deterministic short hash so the same schema always produces the
+# same name across calls. Sanitizing-in-place would silently mutate
+# user intent; the hash is a more honest fallback.
 def _derive_schema_name(schema: dict[str, Any]) -> str:
     title = schema.get("title")
-    if isinstance(title, str) and title:
+    if isinstance(title, str) and _OPENAI_SCHEMA_NAME_RE.fullmatch(title):  # match() + "$" admits "name\n"
         return title
     canonical = json.dumps(schema, sort_keys=True).encode("utf-8")
     return f"oa_schema_{hashlib.sha256(canonical).hexdigest()[:16]}"
@@ -546,9 +560,11 @@ def _parse_and_validate(
 # Construct a fresh message list with a schema directive added. The
 # directive is appended to the existing system message's content when
 # present, or prepended as a new system message otherwise. The caller's
-# original list is never mutated; Message instances are reused because
-# they are immutable Pydantic models. The serialized schema appears
-# verbatim in the directive so callers that need to verify the directive
+# original list is never mutated; Message instances are reused, and
+# this helper does not modify them in place (the message models are
+# not frozen Pydantic models, so the safety is structural, not
+# enforced by the type). The serialized schema appears verbatim in
+# the directive so callers that need to verify the directive
 # references the schema (conformance harnesses, observability spans)
 # can substring-match the canonical JSON form.
 def _augment_messages_with_schema_directive(
@@ -585,7 +601,10 @@ def _message_to_wire(msg: Message) -> dict[str, Any]:
                     "type": "function",
                     "function": {
                         "name": tc.name,
-                        "arguments": json.dumps(tc.arguments or {}),
+                        # Canonical compact form (no inter-token spaces). Matches
+                        # the spec's wire-mapping fixture (005, cases shape) and
+                        # the form OpenAI itself emits.
+                        "arguments": json.dumps(tc.arguments or {}, separators=(",", ":")),
                     },
                 }
                 for tc in msg.tool_calls
diff --git a/tests/conformance/harness/wire.py b/tests/conformance/harness/wire.py
@@ -42,9 +42,10 @@ def match_wire_body(
 ) -> None:
     """Recursive deep-equal between an actual wire-body value and an
     expected shape. Strings equal to ``"*"`` in the expected value match
-    any non-empty string in the actual value. Keys present in
-    ``expected`` MUST be present in ``actual`` and equal; keys present
-    in ``actual`` but absent from ``expected`` are allowed.
+    any non-empty string in the actual value. ``expected_wire_request``
+    is a literal compare: keys present in ``actual`` but absent from
+    ``expected`` are NOT allowed. Partial assertions belong in the
+    sibling ``expected_wire_request_checks`` block.
 
     Raises :class:`AssertionError` with a JSON-pointer-style path on
     mismatch.
@@ -61,6 +62,9 @@ def match_wire_body(
             raise AssertionError(f"wire mismatch at {path}: expected object, got {type(actual).__name__}")
         expected_map = cast("Mapping[str, Any]", expected)
         actual_map = cast("Mapping[str, Any]", actual)
+        extra = set(actual_map) - set(expected_map)
+        if extra:
+            raise AssertionError(f"wire mismatch at {path}: unexpected extra keys in actual: {sorted(extra)}")
         for key, exp_v in expected_map.items():
             if key not in actual_map:
                 raise AssertionError(f"wire mismatch at {path}: missing key {key!r}")
diff --git a/tests/conformance/test_llm_provider.py b/tests/conformance/test_llm_provider.py
@@ -37,6 +37,7 @@
     ProviderInvalidRequest,
     ProviderRateLimit,
     Response,
+    RuntimeConfig,
     SystemMessage,
     Tool,
     ToolCall,
@@ -410,23 +411,40 @@ async def _run_one_case(spec: Mapping[str, Any]) -> None:
         await provider.aclose()
 
 
+# Keys that may live as siblings to a ``call:`` block in a cases-shape
+# fixture but are conceptually call-level metadata. ``_iter_calls``
+# copies these from the case into the yielded call so the test runner
+# sees them in one place.
+_CASE_LEVEL_CALL_KEYS = (
+    "expected",
+    "expected_wire_request",
+    "expected_wire_request_checks",
+    "response_schema",
+    "retry_middleware",
+)
+
+
 def _iter_calls(spec: Mapping[str, Any]) -> Iterator[Mapping[str, Any]]:
-    """Yield each call dict with its ``expected`` block attached.
+    """Yield each call dict with its case-level metadata attached.
 
     Two shapes the fixtures use:
     - ``calls: [{operation, messages, expected, ...}]`` — call and
       expected are siblings inside each call entry.
     - ``call: {operation, messages, ...}`` + sibling ``expected: ...``
-      — the case-shape, where expected lives alongside the call.
-    Both are normalised here to a flat dict where ``expected`` is on
-    the call.
+      (and possibly ``expected_wire_request:``, ``response_schema:``,
+      ``retry_middleware:``) — the case-shape, where call-level
+      metadata lives alongside the call. All sibling keys in
+      ``_CASE_LEVEL_CALL_KEYS`` are folded into the call dict here so
+      the runner reads them from one place. The nested ``call`` block
+      takes precedence when both are present.
     """
     if "calls" in spec:
         yield from cast("list[Mapping[str, Any]]", spec["calls"])
     elif "call" in spec:
         call = dict(cast("Mapping[str, Any]", spec["call"]))
-        if "expected" in spec and "expected" not in call:
-            call["expected"] = spec["expected"]
+        for key in _CASE_LEVEL_CALL_KEYS:
+            if key in spec and key not in call:
+                call[key] = spec[key]
         yield call
     else:
         raise AssertionError("fixture has neither `calls` nor `call` block")
@@ -441,6 +459,8 @@ async def _run_one_call(
     expected = cast("Mapping[str, Any]", call_spec.get("expected") or {})
     response_schema = call_spec.get("response_schema")
     retry_mw_cfg = cast("Mapping[str, Any] | None", call_spec.get("retry_middleware"))
+    config_block = call_spec.get("config")
+    config = RuntimeConfig(**cast("Mapping[str, Any]", config_block)) if config_block else None
 
     if operation == "complete":
         # Per spec §3 "Validation timing" — complete() validates at
@@ -461,7 +481,7 @@ async def _run_one_call(
                 except ValidationError as ve:
                     raise ProviderInvalidRequest(str(ve)) from ve
                 await _maybe_with_retry(
-                    lambda: provider.complete(messages, tools, response_schema=response_schema),
+                    lambda: provider.complete(messages, tools, config, response_schema=response_schema),
                     retry_mw_cfg,
                 )
             _assert_raises_matches(excinfo, expected["raises"])
@@ -476,7 +496,7 @@ async def _run_one_call(
             messages_snapshot = [m.model_dump(mode="json") for m in messages]
             tools = _build_tools(cast("list[Mapping[str, Any]] | None", call_spec.get("tools")))
             response = await _maybe_with_retry(
-                lambda: provider.complete(messages, tools, response_schema=response_schema),
+                lambda: provider.complete(messages, tools, config, response_schema=response_schema),
                 retry_mw_cfg,
             )
             _assert_response_matches(response, cast("Mapping[str, Any]", expected.get("response") or {}))
diff --git a/tests/unit/test_structured_output.py b/tests/unit/test_structured_output.py
@@ -57,6 +57,21 @@ def test_validate_response_schema_rejects_missing_type() -> None:
         validate_response_schema({"properties": {"x": {"type": "integer"}}})
 
 
+def test_validate_response_schema_rejects_malformed_schema() -> None:
+    # `"type": "foobar"` is not a valid JSON Schema type keyword; the
+    # boundary check should catch this and raise ProviderInvalidRequest
+    # rather than letting jsonschema.SchemaError leak at parse time.
+    with pytest.raises(ProviderInvalidRequest, match="not a valid JSON Schema"):
+        validate_response_schema(
+            {
+                "type": "object",
+                "properties": {"x": {"type": "foobar"}},
+                "required": ["x"],
+                "additionalProperties": False,
+            }
+        )
+
+
 # ---------------------------------------------------------------------------
 # strict_mode_supported
 # ---------------------------------------------------------------------------
@@ -92,6 +107,18 @@ def test_strict_mode_additional_properties_true_fails() -> None:
     assert strict_mode_supported(schema) is False
 
 
+def test_strict_mode_missing_additional_properties_fails() -> None:
+    # OpenAI strict mode requires additionalProperties: false to be
+    # EXPLICITLY set; absence (the default for Pydantic-derived schemas)
+    # is not strict-compatible.
+    schema = {
+        "type": "object",
+        "properties": {"a": {"type": "string"}},
+        "required": ["a"],
+    }
+    assert strict_mode_supported(schema) is False
+
+
 def test_strict_mode_recurses_into_nested_object() -> None:
     schema: dict[str, Any] = {
         "type": "object",
@@ -132,10 +159,12 @@ def test_strict_mode_resolves_internal_ref() -> None:
                 "type": "object",
                 "properties": {"a": {"type": "string"}},
                 "required": ["a"],
+                "additionalProperties": False,
             }
         },
         "properties": {"inner": {"$ref": "#/$defs/Inner"}},
         "required": ["inner"],
+        "additionalProperties": False,
     }
     assert strict_mode_supported(schema) is True
 
@@ -153,7 +182,7 @@ def test_strict_mode_handles_ref_cycle() -> None:
     # Self-referential schema: each entry has a "children" key pointing
     # back to the same definition. Without cycle protection this would
     # recurse forever.
-    schema = {
+    schema: dict[str, Any] = {
         "type": "object",
         "$defs": {
             "Node": {
@@ -163,10 +192,12 @@ def test_strict_mode_handles_ref_cycle() -> None:
                     "children": {"$ref": "#/$defs/Node"},
                 },
                 "required": ["value", "children"],
+                "additionalProperties": False,
             }
         },
         "properties": {"root": {"$ref": "#/$defs/Node"}},
         "required": ["root"],
+        "additionalProperties": False,
     }
     assert strict_mode_supported(schema) is True
 
@@ -198,6 +229,28 @@ def test_derive_schema_name_ignores_empty_title() -> None:
     assert _derive_schema_name(schema).startswith("oa_schema_")
 
 
+def test_derive_schema_name_falls_back_on_title_with_spaces() -> None:
+    # OpenAI's name field rejects spaces; the hash fallback fires.
+    schema = {
+        "type": "object",
+        "title": "Person Record",
+        "properties": {"x": {"type": "string"}},
+        "required": ["x"],
+    }
+    assert _derive_schema_name(schema).startswith("oa_schema_")
+
+
+def test_derive_schema_name_falls_back_on_title_too_long() -> None:
+    # OpenAI's name field has a 64-char cap; longer titles fall back.
+    schema = {
+        "type": "object",
+        "title": "A" * 65,
+        "properties": {"x": {"type": "string"}},
+        "required": ["x"],
+    }
+    assert _derive_schema_name(schema).startswith("oa_schema_")
+
+
 # ---------------------------------------------------------------------------
 # _augment_messages_with_schema_directive
 # ---------------------------------------------------------------------------
@@ -273,6 +326,24 @@ def handler(request: httpx.Request) -> httpx.Response:
     return httpx.MockTransport(handler)
 
 
+async def test_non_basemodel_class_raises_provider_invalid_request() -> None:
+    transport = _mock_chat_completion_response('{"x":1}')
+    provider = OpenAIProvider(
+        base_url="http://mock-llm.test",
+        model="test-model",
+        api_key="test-key",
+        transport=transport,
+    )
+    try:
+        with pytest.raises(ProviderInvalidRequest, match="BaseModel subclass"):
+            await provider.complete(
+                [UserMessage(content="x")],
+                response_schema=str,  # type: ignore[arg-type]
+            )
+    finally:
+        await provider.aclose()
+
+
 async def test_pydantic_class_returns_validated_instance() -> None:
     transport = _mock_chat_completion_response('{"name":"Alice","age":30}')
     provider = OpenAIProvider(