openarmature-python/tests/conformance/harness/fixtures.py at a57e8bd0bc4a00a590f92e7e691940a3eb32ffa1 · LunarCommand/openarmature-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
"""Typed fixture root models.

Per the Phase 0 plan: every YAML fixture under
``openarmature-spec/spec/<capability>/conformance/`` lands as one of the
typed shapes in the ``Fixture`` union. The shape is chosen by a callable
discriminator that inspects the raw dict's top-level keys (no tag field is
present in the YAML).

Three of the shapes are defined in this module; the fourth,
``PromptManagementFixture``, is imported from :mod:`prompt_management`:

- :class:`LlmProviderFixture` — ``mock_provider`` is at the top level. Tests
  the stateless ``complete()`` / ``ready()`` operations of the
  ``llm-provider`` capability against canned wire responses. May contain
  ``cases:`` for table-style sub-cases that share the mock provider.

- :class:`CasesFixture` — top-level ``cases:`` list (and no
  ``mock_provider``). Each case carries its own graph definition and
  expected block. Optional shared ``subgraph`` / ``subgraph_with_idx``
  blocks at the top level apply across cases.

- :class:`GraphFixture` — direct graph at the top level (state + entry +
  nodes + edges + initial_state + expected). Optional ``run_count`` for
  determinism fixtures, plus a long tail of optional harness directives
  (``observers``, ``middleware``, ``caller_correlation_id``,
  ``detached_subgraphs``, etc.).

Sub-shapes (state field schemas, node directives, edge specs, middleware
specs, observer specs, expected blocks) live in :mod:`directives` and
:mod:`expectations`. The split is for readability; what's authoritative is
the ``Fixture`` union here parsing every fixture in the spec submodule,
with ``extra="forbid"`` rejecting unknown keys on the three root shapes
defined here. :class:`CaseSpec` and :class:`SubgraphDefinition`
deliberately use ``extra="allow"`` instead.
"""

from __future__ import annotations

from typing import Annotated, Any, Literal

from pydantic import BaseModel, ConfigDict, Discriminator, Tag

from .directives import (
    EdgeSpec,
    LlmCallSpec,
    MiddlewareConfig,
    MockProviderConfig,
    MockResponse,
    NodeSpec,
    ObserverSpec,
    StateSchema,
)
from .expectations import ExpectedBlock, LlmProviderExpected
from .prompt_management import PromptManagementFixture


class _ForbidExtras(BaseModel):
    """Common base — strict by default.

    Catches both fixture authors and us drifting from the spec; new
    directives surface as parse errors at the point they're introduced
    rather than getting silently dropped.

    The root fixture shapes in this module inherit from this class so the
    strict setting is declared in one place.
    """

    # ``extra="forbid"``: a key that is not a declared field is a
    # validation error instead of being ignored or stored.
    model_config = ConfigDict(extra="forbid")


# ---------------------------------------------------------------------------
# Shared sub-shapes
# ---------------------------------------------------------------------------


class SubgraphDefinition(BaseModel):
    """A subgraph at the fixture's top level (singular ``subgraph:`` form
    or one entry of the plural ``subgraphs:`` map). Carries its own state
    schema, nodes, and edges — structurally a mini-graph. Permissive
    extras to absorb subgraph-local middleware blocks (pipeline-utilities/
    020) and any future extension.

    ``state``, ``entry``, ``nodes`` and ``edges`` are required; ``name``
    and ``middleware`` default to ``None``.
    """

    # Intentionally permissive: unknown keys are accepted, unlike the
    # ``_ForbidExtras``-based root shapes in this module.
    model_config = ConfigDict(extra="allow")

    name: str | None = None  # singular `subgraph:` form
    # The mini-graph itself: state schema, entry node name, node table
    # keyed by node name, and the edge list.
    state: StateSchema
    entry: str
    nodes: dict[str, NodeSpec]
    edges: list[EdgeSpec]
    # Optional subgraph-local middleware block.
    middleware: MiddlewareConfig | None = None


class CaseSpec(BaseModel):
    """One sub-case in a ``CasesFixture`` (or in the ``cases:`` block of an
    LlmProviderFixture).

    The shape of a case is fluid — checkpointing fixtures (027–031) bring
    in ``checkpointer``/``first_run_expected_error``/``saved_record_assertions``/
    ``resume`` blocks; llm-provider cases bring in ``call`` /
    ``expected_wire_request``; graph-engine ``007-compile-errors`` cases
    have ``graph:`` wrapping the graph + ``expected_compile_error``;
    observability cases inherit any harness directive a top-level
    ``GraphFixture`` could carry. Permissive extras so the parse keeps
    pace with case-shape evolution without quarterly model edits.

    Only ``name`` is required; every other declared field defaults to
    ``None``. Keys that are not declared below are accepted rather than
    rejected because of ``extra="allow"``.
    """

    # Intentionally permissive — see the class docstring for the rationale.
    model_config = ConfigDict(extra="allow")

    # Case identifier (the only mandatory key) and optional free-text
    # description.
    name: str
    description: str | None = None
    # graph-engine 007 compile-errors: a case wraps the malformed graph
    # under a `graph:` key alongside `expected_compile_error`.
    # Kept as an untyped dict, so its contents are not validated against
    # the graph sub-shape models.
    graph: dict[str, Any] | None = None
    expected_compile_error: str | None = None
    # The graph-shaped fields when a case carries the graph inline (rather
    # than under ``graph:``).
    state: StateSchema | None = None
    entry: str | None = None
    nodes: dict[str, NodeSpec] | None = None
    edges: list[EdgeSpec] | None = None
    initial_state: dict[str, Any] | None = None
    subgraph: SubgraphDefinition | None = None
    subgraphs: dict[str, SubgraphDefinition] | None = None
    middleware: MiddlewareConfig | None = None
    observers: list[ObserverSpec] | None = None
    expected: ExpectedBlock | None = None
    expected_error: dict[str, Any] | None = None
    # llm-provider sub-cases.
    call: LlmCallSpec | None = None
    expected_wire_request: dict[str, Any] | None = None
    # Checkpointing fixtures (024-031, 048-054). Two shapes:
    #   - ``str`` (e.g. ``"in_memory"``): backend kind selector.
    #   - ``dict``: backend kind + config knobs (e.g. fixture 054's
    #     ``{kind: in_memory_batched, fan_out_internal_save_batching: ...}``).
    checkpointer: str | dict[str, Any] | None = None
    first_run_expected_error: dict[str, Any] | None = None
    saved_record_assertions: dict[str, Any] | None = None
    latest_record_assertions: dict[str, Any] | None = None
    resume: dict[str, Any] | None = None
    invariants: dict[str, Any] | None = None
    # Either an int (run count) or a list of run configs — fixtures vary.
    # Typed ``Any``, so no shape is enforced at parse time.
    populate_checkpointer_via_runs: Any = None
    invoke_with: dict[str, Any] | None = None
    caller_correlation_id: str | None = None
    # observability — mock LLM responses + per-case run config.
    mock_llm: list[MockResponse] | None = None
    invocations: int | None = None


# ---------------------------------------------------------------------------
# LlmProviderFixture
# ---------------------------------------------------------------------------


class LlmProviderFixture(_ForbidExtras):
    """A fixture under ``spec/llm-provider/conformance/``.

    Either ``calls`` is at the top level (single-case) or wrapped in
    ``cases`` (table-style). ``mock_provider`` is always present and
    discriminates this shape from the graph-shaped fixtures.

    Both ``calls`` and ``cases`` are optional at the model level; this
    class does not itself enforce that exactly one of them is supplied.
    Unknown top-level keys are rejected via ``_ForbidExtras``.
    """

    # Required — its presence is what routes a raw dict to this shape.
    mock_provider: MockProviderConfig
    # Single-case form: calls listed directly at the top level.
    calls: list[LlmCallSpec] | None = None
    # Table-style form: sub-cases sharing the mock provider.
    cases: list[CaseSpec] | None = None


# ---------------------------------------------------------------------------
# CasesFixture
# ---------------------------------------------------------------------------


class CasesFixture(_ForbidExtras):
    """A fixture whose top level is ``cases:`` rather than a single graph.

    Used by ``007-compile-errors``, the checkpointing fixtures (024–031),
    and the determinism / multi-run observability fixtures. Optional shared
    ``subgraph`` / ``subgraph_with_idx`` / ``subgraphs`` at the top level
    apply across all cases. Any other top-level key not listed here is
    rejected.

    The strictness applies to this root model only: each entry of
    ``cases`` is a ``CaseSpec``, which accepts extra keys.
    """

    # Required list of sub-cases.
    cases: list[CaseSpec]
    # Shared graph-shape blocks that apply across every case. The singular
    # `subgraph` / `subgraph_with_idx` and the plural `subgraphs` map
    # (name -> graph-spec, as the parallel-branches fixtures use) may all
    # appear at the cases-fixture top level. Fixture 064 (failure-isolation
    # cause fidelity) is the first to share a plural `subgraphs:` across
    # cases.
    subgraph: SubgraphDefinition | None = None
    subgraph_with_idx: SubgraphDefinition | None = None
    subgraphs: dict[str, SubgraphDefinition] | None = None


# ---------------------------------------------------------------------------
# GraphFixture
# ---------------------------------------------------------------------------


class GraphFixture(_ForbidExtras):
    """A fixture whose top level IS a single graph.

    Covers the bulk of graph-engine, pipeline-utilities, and observability
    fixtures. Most fields are optional because different fixtures exercise
    different facets of the graph contract.

    ``state`` is the only required field. Unknown top-level keys are
    rejected via ``_ForbidExtras``. This is also the shape the
    discriminator falls back to when no other shape's marker keys are
    present.
    """

    # Graph definition (graph-engine + most others).
    state: StateSchema
    entry: str | None = None
    nodes: dict[str, NodeSpec] | None = None
    edges: list[EdgeSpec] | None = None
    initial_state: dict[str, Any] | None = None
    expected: ExpectedBlock | None = None

    # Legacy: top-level expected_error in graph-engine fixtures 008/009.
    expected_error: dict[str, Any] | None = None

    # Subgraph definitions — singular form for graph-engine; plural map for
    # the multi-subgraph cases in observability/008, observability/010, and
    # pipeline-utilities/029.
    subgraph: SubgraphDefinition | None = None
    subgraphs: dict[str, SubgraphDefinition] | None = None
    # Used by pipeline-utilities/020 (fan-out instances expose their idx).
    subgraph_with_idx: SubgraphDefinition | None = None

    # graph-engine §6 observers (since proposal 0003).
    observers: list[ObserverSpec] | None = None

    # pipeline-utilities §6 middleware (proposal 0004) and §10 checkpointer
    # registration (proposal 0008).
    middleware: MiddlewareConfig | None = None
    # NOTE(review): only the ``str`` form is accepted here, whereas
    # ``CaseSpec.checkpointer`` also accepts a ``dict`` — confirm no
    # top-level graph fixture needs the dict form.
    checkpointer: str | None = None
    clock_stub: dict[str, Any] | None = None

    # Determinism fixtures — graph-engine/010 and pipeline-utilities/011.
    run_count: int | None = None

    # observability / pipeline-utilities cross-cutting harness directives.
    # These are inputs to the test harness, NOT the engine.
    caller_correlation_id: str | None = None
    detached_subgraphs: list[str] | None = None
    detached_fan_outs: list[str] | None = None
    disable_llm_spans: bool | None = None
    # Proposal 0024 (v0.17.0): observer-level opt-outs for the new
    # §5.5.1 payload and §5.5.2/§5.5.3 GenAI semconv attribute sets.
    # ``disable_provider_payload`` defaults to True per §5.5.4 — fixtures
    # that EXERCISE payload emission set it false explicitly (013-018).
    # ``disable_genai_semconv`` defaults to False — fixture 021 sets
    # it true to verify the opt-out.
    # Both fields parse to ``None`` when the key is absent; the
    # True/False defaults described above are not applied by this model
    # (presumably the harness applies them — verify against the consumer).
    disable_provider_payload: bool | None = None
    disable_genai_semconv: bool | None = None
    # Proposal 0024 (v0.17.0, fixture 020): provider-level configuration
    # overrides — ``provider.genai_system`` overrides the default
    # ``"openai"`` value of ``gen_ai.system`` for OpenAI-compatible
    # providers serving non-OpenAI endpoints (vLLM, LM Studio, …).
    provider: dict[str, Any] | None = None
    mock_llm: list[MockResponse] | None = None
    caller_global_otel_active: bool | None = None
    # Two shapes:
    # - ``int``: run-count for observability multi-run fixtures (legacy).
    # - ``list[dict]``: per-invocation specs for proposal 0010 §6 Drain
    #   cross-invocation cleanliness fixtures (e.g., fixture 024). Each
    #   entry carries its own ``initial_state``, ``drain``, ``expected``.
    # The per-invocation dicts are untyped, so their keys are not
    # validated here.
    invocations: int | list[dict[str, Any]] | None = None
    # Proposal 0010 §6 Drain — the ``invoke`` directive wraps the
    # ``drain.timeout_seconds`` parameter for single-invocation
    # drain-timeout fixtures (022, 023, 025). Multi-invocation fixture
    # 024 uses the ``invocations`` array above instead.
    invoke: dict[str, Any] | None = None
    # Proposal 0010 §6 Drain — top-level invariants applied across all
    # invocations of a multi-invocation fixture (e.g.,
    # ``second_invocation_drain_independent_of_first`` on fixture 024).
    # Single-invocation fixtures put their invariants under
    # ``expected.invariants`` (the field already on ExpectedBlock).
    invariants: dict[str, Any] | None = None


# ---------------------------------------------------------------------------
# Discriminator + root union
# ---------------------------------------------------------------------------


def _discriminate_fixture(
    value: Any,
) -> Literal["llm_provider", "prompt_management", "cases", "graph"]:
    """Return the union tag of the fixture shape that should parse ``value``.

    ``value`` is either an already-built fixture model (serialization
    path) or the raw top-level dict loaded from YAML (validation path).

    Resolution order for a raw dict — earlier rules win:

    1. ``mock_provider`` present -> ``"llm_provider"``. This beats
       ``cases`` because some llm-provider fixtures (e.g.
       003-message-validation) carry BOTH; ``mock_provider`` is the
       load-bearing marker and ``cases`` is just the table style.
    2. ``backends`` present together with ``calls`` or ``cases``, and
       none of ``nodes`` / ``edges`` / ``state`` / ``entry`` ->
       ``"prompt_management"``.
    3. ``cases`` present -> ``"cases"``.
    4. Anything else -> ``"graph"``.

    Model instances are mapped straight to their own tag so that a
    future ``model_dump`` through the top-level union doesn't fall
    through to ``"graph"`` and warn. Any value that is neither a known
    model nor a dict also resolves to ``"graph"``.
    """
    # Serialization path: the value is already a concrete variant.
    if isinstance(value, LlmProviderFixture):
        return "llm_provider"
    if isinstance(value, PromptManagementFixture):
        return "prompt_management"
    if isinstance(value, CasesFixture):
        return "cases"
    if isinstance(value, GraphFixture):
        return "graph"

    # Nothing below applies to non-dict input; ``graph`` is the fallback.
    if not isinstance(value, dict):
        return "graph"

    if "mock_provider" in value:
        return "llm_provider"

    has_cases = "cases" in value
    carries_graph_keys = any(
        key in value for key in ("nodes", "edges", "state", "entry")
    )
    # Prompt-management is recognised by co-occurrence, not by
    # ``backends:`` alone: until a spec-side ``kind:`` field lands, a
    # bare ``backends:`` check would silently misroute any future fixture
    # that introduces a backends list for some other purpose.
    # Per proposal 0046 (v0.38.0) the chat-prompt fixtures (017-031) pair
    # ``backends:`` with top-level ``cases:`` instead of ``calls:``, so
    # either companion key qualifies.
    looks_like_prompt_management = (
        "backends" in value
        and ("calls" in value or has_cases)
        and not carries_graph_keys
    )
    if looks_like_prompt_management:
        return "prompt_management"

    return "cases" if has_cases else "graph"


# Root union for every conformance fixture. Each member is tagged with the
# string that ``_discriminate_fixture`` returns for it, and the callable
# discriminator selects the member to validate against from the raw
# top-level keys (the YAML carries no explicit tag field).
Fixture = Annotated[
    Annotated[LlmProviderFixture, Tag("llm_provider")]
    | Annotated[PromptManagementFixture, Tag("prompt_management")]
    | Annotated[CasesFixture, Tag("cases")]
    | Annotated[GraphFixture, Tag("graph")],
    Discriminator(_discriminate_fixture),
]


# Public names of this module. ``LlmProviderExpected`` and
# ``PromptManagementFixture`` are defined in sibling modules and
# re-exported here.
__all__ = [
    "CaseSpec",
    "CasesFixture",
    "Fixture",
    "GraphFixture",
    "LlmProviderExpected",
    "LlmProviderFixture",
    "PromptManagementFixture",
    "SubgraphDefinition",
]