test(conformance): prompt-management harness and 12 fixtures

chris-colinsky · chris-colinsky · commit 9bdbcb7e2173 · 2026-05-15T18:59:30.000-07:00
Adds prompt-management as the fifth conformance capability:

- harness/prompt_management.py — typed YAML models for the new
  fixture shape (backends + manager + calls with target /
  operation / capture_as, plus per-call and top-level expected
  blocks for raises / result_equivalence / prompt_group /
  rendered_hash_equal / rendered_hash_different).
- harness/fixtures.py — PromptManagementFixture added to the
  discriminated union; the discriminator recognizes top-level
  'backends:' (without 'mock_provider:') as the prompt-management
  shape.
- harness/loader.py — 'prompt-management' added to CAPABILITIES
  so test_fixture_parsing.py discovers and parses the new
  fixtures.

test_prompt_management.py drives all 12 spec fixtures
(001-fetch-success through 012-prompt-result-rendered-hash-stability)
against the real PromptManager + a MockPromptBackend that
implements the protocol with optional simulate_unavailable +
preloaded prompts + a call_count for fixtures that assert
fallback chain visits. All 12 fixtures pass.
diff --git a/tests/conformance/harness/fixtures.py b/tests/conformance/harness/fixtures.py
@@ -47,6 +47,7 @@
     StateSchema,
 )
 from .expectations import ExpectedBlock, LlmProviderExpected
+from .prompt_management import PromptManagementFixture
 
 
 class _ForbidExtras(BaseModel):
@@ -234,34 +235,45 @@ class GraphFixture(_ForbidExtras):
 # ---------------------------------------------------------------------------
 
 
-def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph"]:
+def _discriminate_fixture(
+    value: Any,
+) -> Literal["llm_provider", "prompt_management", "cases", "graph"]:
     """Pick the fixture shape from a raw YAML dict.
 
-    Order matters: ``mock_provider`` wins over ``cases`` because some
-    llm-provider fixtures (e.g. 003-message-validation) have BOTH —
-    ``mock_provider`` is the load-bearing discriminator, ``cases`` is just
-    the table style for sub-cases.
+    Order matters:
+
+    - ``mock_provider`` wins over ``cases`` because some llm-provider
+      fixtures (e.g. 003-message-validation) have BOTH — ``mock_provider``
+      is the load-bearing discriminator, ``cases`` is the table style.
+    - ``backends`` at the top level (without ``mock_provider``) picks
+      the prompt-management shape. Spec/prompt-management fixtures
+      always carry ``backends:``.
 
     Also handle the serialization path (where the value is a concrete
     variant) so a future ``model_dump`` through the top-level union
     doesn't fall through to ``graph`` and warn.
     """
     if isinstance(value, LlmProviderFixture):
         return "llm_provider"
+    if isinstance(value, PromptManagementFixture):
+        return "prompt_management"
     if isinstance(value, CasesFixture):
         return "cases"
     if isinstance(value, GraphFixture):
         return "graph"
     if isinstance(value, dict):
         if "mock_provider" in value:
             return "llm_provider"
+        if "backends" in value:
+            return "prompt_management"
         if "cases" in value:
             return "cases"
     return "graph"
 
 
 Fixture = Annotated[
     Annotated[LlmProviderFixture, Tag("llm_provider")]
+    | Annotated[PromptManagementFixture, Tag("prompt_management")]
     | Annotated[CasesFixture, Tag("cases")]
     | Annotated[GraphFixture, Tag("graph")],
     Discriminator(_discriminate_fixture),
@@ -275,5 +287,6 @@ def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph
     "GraphFixture",
     "LlmProviderExpected",
     "LlmProviderFixture",
+    "PromptManagementFixture",
     "SubgraphDefinition",
 ]
diff --git a/tests/conformance/harness/loader.py b/tests/conformance/harness/loader.py
@@ -31,6 +31,7 @@
     "llm-provider",
     "pipeline-utilities",
     "observability",
+    "prompt-management",
 )
 
 CONFORMANCE_ROOT = Path(__file__).resolve().parents[3] / "openarmature-spec" / "spec"
diff --git a/tests/conformance/harness/prompt_management.py b/tests/conformance/harness/prompt_management.py
@@ -0,0 +1,174 @@
+"""Typed YAML models for prompt-management conformance fixtures.
+
+Fixture shape (different from the llm-provider / graph shapes):
+
+- ``backends:`` — list of mock backend specs (each with ``name``,
+  optional ``simulate_unavailable``, and a list of ``prompts``).
+- ``manager:`` — optional manager composition (a list of backend
+  names, in fallback order).
+- ``calls:`` — list of operations to drive. Each call has a
+  ``target`` (``{backend: <name>}`` for direct backend operations,
+  or ``manager`` for manager operations), an ``operation``, inputs,
+  optional ``capture_as`` (binds the operation's result to a name
+  usable by later calls / final expectations), and optional
+  per-call ``expected``.
+- ``expected:`` — optional top-level expectation block for
+  PromptGroup shape or cross-call result-equivalence assertions
+  that need access to ``capture_as`` bindings.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+class _StrictModel(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
+
+class _PermissiveModel(BaseModel):
+    """For fixture sub-shapes that vary across fixtures and don't
+    warrant a per-shape enumeration."""
+
+    model_config = ConfigDict(extra="allow")
+
+
+# ---------------------------------------------------------------------------
+# Backend / manager configuration
+# ---------------------------------------------------------------------------
+
+
+class FixturePromptSpec(_StrictModel):
+    name: str
+    label: str
+    version: str
+    template: str
+    template_hash: str
+
+
+class FixtureBackendSpec(_StrictModel):
+    name: str
+    prompts: list[FixturePromptSpec] = []
+    simulate_unavailable: bool = False
+
+
+class FixtureManagerSpec(_StrictModel):
+    backends: list[str]
+
+
+# ---------------------------------------------------------------------------
+# Call targets, operations, and expectations
+# ---------------------------------------------------------------------------
+
+
+class BackendTarget(_StrictModel):
+    backend: str
+
+
+CallTarget = BackendTarget | Literal["manager", "construct_prompt_group"]
+
+
+class FixtureExpectedRaises(_PermissiveModel):
+    category: str
+    # Optional extra carries — fixture 005 uses ``description_mentions``,
+    # ``name``, ``version``, ``label``. fixture 008 uses
+    # ``secondary_backend_call_count``. Permissive on this shape so
+    # fixtures evolve.
+    carries: dict[str, Any] | None = None
+
+
+class FixtureExpectedPrompt(_PermissiveModel):
+    """Per-call ``expected.prompt`` shape (fetch ops)."""
+
+
+class FixtureExpectedPromptResult(_PermissiveModel):
+    """Per-call ``expected.prompt_result`` shape (render / get ops)."""
+
+
+class FixtureExpectedPerCall(_StrictModel):
+    prompt: FixtureExpectedPrompt | None = None
+    prompt_result: FixtureExpectedPromptResult | None = None
+    raises: FixtureExpectedRaises | None = None
+    # Fixture 008's extra: assert how many times the secondary
+    # backend's fetch was called. Lives alongside ``raises``.
+    secondary_backend_call_count: int | None = None
+    # Fixture 009's extra: assert per-backend call counts (named
+    # backends → expected call count) after a fetch that exhausts
+    # all of them.
+    backend_call_counts: dict[str, int] | None = None
+
+
+class FixtureCall(_StrictModel):
+    target: CallTarget
+    # ``operation`` is required for fetch / render / get calls. The
+    # ``construct_prompt_group`` shape uses the target as the operation
+    # indicator (no separate operation field on the call).
+    operation: Literal["fetch", "render", "get"] | None = None
+    name: str | None = None
+    label: str | None = None
+    variables: dict[str, Any] | None = None
+    # Render-only inputs — either an inline ``fetched_prompt`` (which
+    # the harness fetches first, then renders) or a ``fetched_prompt_ref``
+    # pointing at an earlier ``capture_as``.
+    fetched_prompt: dict[str, str] | None = None
+    fetched_prompt_ref: str | None = None
+    # construct_prompt_group-only inputs.
+    group_name: str | None = None
+    members_refs: list[str] | None = None
+    capture_as: str | None = None
+    expected: FixtureExpectedPerCall | None = None
+
+
+# ---------------------------------------------------------------------------
+# Top-level expected
+# ---------------------------------------------------------------------------
+
+
+class FixtureExpectedPromptGroup(_PermissiveModel):
+    """Top-level ``expected.prompt_group`` shape (fixture 011)."""
+
+    of: str
+    group_name: str
+    member_count: int
+    member_order_preserved: bool | None = None
+    member_names: list[str] | None = None
+
+
+class FixtureExpectedResultEquivalence(_PermissiveModel):
+    """Top-level ``expected.result_equivalence`` shape (fixtures 006,
+    010, 012). Asserts equality across two or more captured results on
+    a configurable set of fields."""
+
+    of: list[str]
+    fields_must_match: list[str]
+    fields_may_differ: list[str] = []
+    # fixture 012 — assert two different captures have a DIFFERENT
+    # value on a given field.
+    fields_must_differ: list[str] = []
+
+
+class FixtureExpectedTopLevel(_StrictModel):
+    prompt_group: FixtureExpectedPromptGroup | None = None
+    result_equivalence: FixtureExpectedResultEquivalence | None = None
+    # Some fixtures (012) have multiple result-equivalence blocks; keep
+    # a plural list-form too. Empty by default.
+    result_equivalences: list[FixtureExpectedResultEquivalence] = []
+    # Fixture 012's per-pair rendered_hash equality / inequality
+    # assertions. Each entry is a 2-element list of capture names; the
+    # pair MUST share (resp. differ on) ``rendered_hash``.
+    rendered_hash_equal: list[list[str]] = []
+    rendered_hash_different: list[list[str]] = []
+
+
+# ---------------------------------------------------------------------------
+# Fixture root
+# ---------------------------------------------------------------------------
+
+
+class PromptManagementFixture(_StrictModel):
+    backends: list[FixtureBackendSpec]
+    manager: FixtureManagerSpec | None = None
+    calls: list[FixtureCall]
+    expected: FixtureExpectedTopLevel | None = None
diff --git a/tests/conformance/test_prompt_management.py b/tests/conformance/test_prompt_management.py

Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@`
`31`	`31`	`"llm-provider",`
`32`	`32`	`"pipeline-utilities",`
`33`	`33`	`"observability",`
	`34`	`+ "prompt-management",`
`34`	`35`	`)`
`35`	`36`
`36`	`37`	`CONFORMANCE_ROOT = Path(__file__).resolve().parents[3] / "openarmature-spec" / "spec"`