diff --git a/openarmature-spec b/openarmature-spec index e3ca54c..9005ca0 160000 --- a/openarmature-spec +++ b/openarmature-spec @@ -1 +1 @@ -Subproject commit e3ca54c7b49d6d0007813375865e8716a8f0070f +Subproject commit 9005ca0efaddd845c2903f1a0eafd10d985439a5 diff --git a/tests/conformance/harness/__init__.py b/tests/conformance/harness/__init__.py new file mode 100644 index 0000000..3b2b3b0 --- /dev/null +++ b/tests/conformance/harness/__init__.py @@ -0,0 +1,36 @@ +"""Conformance fixture harness — typed parsing for the four spec capabilities. + +Phase 0 (per the implementation plan): every fixture under +``openarmature-spec/spec/*/conformance/`` lands as a typed pydantic +config. Phases 1–6 add runtime interpretation under ``harness/runtime/``; +they never re-touch parsing. + +Public surface: + +- :func:`loader.load_fixture` — parse one YAML path into a typed fixture. +- :func:`loader.discover_fixtures` — auto-discover fixture paths across the + four capability directories on the spec submodule. +- :class:`fixtures.Fixture` — the root discriminated union + (``LlmProviderFixture | CasesFixture | GraphFixture``). +- :class:`skip.SkipReason` — structured "fixture needs directives X, current + phase doesn't support them" used by the test runner to skip cleanly. +""" + +from .fixtures import ( + CasesFixture, + Fixture, + GraphFixture, + LlmProviderFixture, +) +from .loader import discover_fixtures, load_fixture +from .skip import SkipReason + +__all__ = [ + "CasesFixture", + "Fixture", + "GraphFixture", + "LlmProviderFixture", + "SkipReason", + "discover_fixtures", + "load_fixture", +] diff --git a/tests/conformance/harness/directives.py b/tests/conformance/harness/directives.py new file mode 100644 index 0000000..b2824f4 --- /dev/null +++ b/tests/conformance/harness/directives.py @@ -0,0 +1,517 @@ +"""Typed directive sub-models — the shapes referenced inside fixtures. 
+ +Phase 0 typing strategy: model every key in every fixture, but use +``dict[str, Any]`` for genuinely polymorphic payloads (notably the inner +``state``/``nodes``/``edges`` of recursive subgraph definitions, and the +update payloads themselves which are arbitrary state-shaped dicts). The +load-bearing invariant is that every TOP-LEVEL and DIRECTIVE key is +known — a fixture introducing a new directive that we haven't modelled +fails parsing immediately, and that's exactly what we want. + +The submodels here (``NodeSpec``, ``MiddlewareSpec``, etc.) are referenced +from :mod:`fixtures` and :mod:`expectations`. The split is for readability; +all of these could live in one file but the file would push 800 lines. +""" + +from __future__ import annotations + +from typing import Annotated, Any, Literal + +from pydantic import ( + BaseModel, + ConfigDict, + Field, + model_validator, +) + + +class _ForbidExtras(BaseModel): + """Strict — used for the structural skeleton (state schema, node primary + directive set, edges, observer registration, middleware config split). + Catches new directives the spec adds at the load-bearing places.""" + + model_config = ConfigDict(extra="forbid") + + +class _AllowExtras(BaseModel): + """Permissive — used for payload-shape models (mock LLM responses, + middleware-specific params, flaky/fan-out config). Validates KNOWN + keys' types but doesn't reject unknown ones — the spec evolves these + payloads frequently and modelling every parameter exhaustively + creates churn without proportional value. The Phase 0 strictness + contract sits at the directive STRUCTURE level (above), not the + parameter-bag level (here).""" + + model_config = ConfigDict(extra="allow") + + +# --------------------------------------------------------------------------- +# State schema (state.fields) +# --------------------------------------------------------------------------- + + +class StateFieldSpec(_ForbidExtras): + """A single state field declaration. 
+ + The ``alt_reducer`` knob exists only for ``graph-engine/007-compile-errors``'s + ``conflicting_reducers`` case — fixtures intentionally declare two reducers + on one field to verify the engine fails compile with the right category. + """ + + type: str + default: Any = None + reducer: str | None = None + alt_reducer: str | None = None + + +class StateSchema(_ForbidExtras): + fields: dict[str, StateFieldSpec] + + +# --------------------------------------------------------------------------- +# Edge specs +# --------------------------------------------------------------------------- + + +class EdgeSpec(_AllowExtras): + """One edge in a graph definition. + + The spec defines static (``from``/``to``) and conditional + (``from``/``condition``) edges; observability/011 also uses a + ``when``-shaped predicate. Schema is permissive here so all forms + parse — Phase 1 (engine retrofit) interprets each shape against + the engine's edge model. + """ + + from_: str = Field(alias="from") + to: str | None = None + condition: dict[str, Any] | None = None + when: dict[str, Any] | None = None + + model_config = ConfigDict(extra="allow", populate_by_name=True) + + +# --------------------------------------------------------------------------- +# Node directive sub-models +# --------------------------------------------------------------------------- + + +class FailureSpec(_AllowExtras): + """One entry in a flaky node's ``failure_sequence``. ``transient: true`` + + ``category`` triggers a transient retry-classifier-friendly raise; + ``transient: false`` raises a non-transient instead.""" + + transient: bool + category: str | None = None + message: str | None = None + + +class FlakySpec(_AllowExtras): + """Base flaky directive shapes. + + Two known sub-shapes share the ``flaky:`` key: + + 1. Sequence form (pipeline-utilities/007 etc.): ``failure_sequence`` of + per-attempt failures + ``success_update`` for the success state. + 2. 
Compact form (pipeline-utilities/029): ``fail_first_invocation_only`` + boolean + ``on_success`` state update — used in checkpoint fixtures + where the failure is keyed to first invocation. + + ``flaky_resume_aware`` (pipeline-utilities/027) is a *separate* node + directive even though it lives under a node named ``flaky`` in that + fixture — see :class:`FlakyResumeAwareSpec`. + """ + + failure_sequence: list[FailureSpec | None] | None = None + success_update: dict[str, Any] | None = None + fail_first_invocation_only: bool | None = None + on_success: dict[str, Any] | None = None + + +class FlakyByIndexSpec(_AllowExtras): + """Fan-out variant: failure depends on ``fan_out_index``. + + Two presence patterns: + + - ``fail_when_idx`` (int) — only that index fails. + - ``fail_count_per_idx`` (int) — every index fails this many attempts + before succeeding. + + Both come with ``category`` (transient category) and ``success_compute`` + (the success state shape). + """ + + fail_when_idx: int | None = None + fail_count_per_idx: int | None = None + category: str | None = None + success_compute: dict[str, Any] + + +class FlakyPerIndexSpec(_AllowExtras): + """Checkpoint-resume variant: indices in ``fail_first_run_indices`` fail + on the first invocation; everyone succeeds on subsequent runs.""" + + fail_first_run_indices: list[int] + success_compute: dict[str, Any] + + +class FlakyInstanceOnlySpec(_AllowExtras): + """Instance-middleware variant: each fan-out instance fails its first + ``fail_count_per_instance`` whole-instance invocations, then succeeds.""" + + fail_count_per_instance: int + category: str + success_compute: dict[str, Any] + + +class FlakyResumeAwareSpec(_AllowExtras): + """Checkpoint-resume + retry variant: fails N times on the first + invocation, then on resume (any later invocation) fails M times before + succeeding. 
Used to verify ``attempt_index`` resets on resume.""" + + fail_first_invocation_count: int + fail_resumed_invocation_count: int + category: str + on_success: dict[str, Any] + + +class UpdateFromFieldSpec(_ForbidExtras): + """Mock computation: result_field = input_field × multiplier. + + Used by fan-out fixtures to give instances a deterministic, + parameterizable computation without needing real LLM calls. The harness + mock interprets this directive at runtime. + """ + + # Free-form: some fixtures use ``{result: x, multiplier: 2}``, others + # ``{score: item}`` with no multiplier. Phase 4 (fan-out runtime) reads + # whichever keys are present. + model_config = ConfigDict(extra="allow") + + +class FanOutSpec(_AllowExtras): + """A fan-out node's configuration. + + Two mutually exclusive modes: + + - ``items_field`` mode — instance count = ``len(parent_state[items_field])``; + each instance's input is ``items_field[i]`` projected into ``item_field``. + - ``count`` mode — instance count = ``count`` (literal int OR callable); + no per-item data. + + Cross-cutting: ``concurrency`` (default 10), ``error_policy`` (default + ``fail_fast``; alternative ``collect``), ``on_empty`` (default ``raise``; + alternative ``noop``), ``count_field`` (writes resolved count to this + parent field), ``errors_field`` (for ``collect`` mode), and + ``instance_middleware`` (whole-instance retry seam). + """ + + subgraph: str + # Mode A — items. + items_field: str | None = None + item_field: str | None = None + # Mode B — count. Permissive ``Any`` because fixtures express + # callable counts as e.g. ``{callable: state_field, field: workers}``. + count: Any = None + count_field: str | None = None + # Common. ``concurrency`` accepts the same shapes as ``count``. 
+ collect_field: str | None = None + target_field: str | None = None + concurrency: Any = None + error_policy: Literal["fail_fast", "collect"] | None = None + on_empty: Literal["raise", "noop"] | None = None + errors_field: str | None = None + instance_middleware: list[MiddlewareSpec] | None = None + + +class CallsLlmSpec(_AllowExtras): + """LLM-using node: sends ``messages`` to the harness's mock provider + and stores the response (assistant content) in ``stores_response_in``. + Used by observability fixtures to verify LLM-provider span emission.""" + + messages: list[dict[str, Any]] + stores_response_in: str + + +class EmitsLogSpec(_AllowExtras): + """Additive companion: the node emits a log record alongside its + state update. Verified by observability fixture 010 (Logs Bridge).""" + + message: str + level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] + + +class GlobalTracerSpec(_AllowExtras): + """Additive companion: the node ALSO emits a span via the OTel global + tracer (in addition to whatever it does normally). Used by + observability fixture 005 to verify private-tracer isolation.""" + + span_name: str + + +class NodeSpec(_ForbidExtras): + """A single node's directive. + + Exactly one *primary* directive must be set: + + - ``update`` / ``update_pure`` / ``update_pure_from_state`` / + ``update_from_field`` — state-update flavours (the latter is a mock + computation interpreted by the fan-out harness). + - ``raises`` — node raises with the given message; pair with optional + ``error_category``. + - ``subgraph`` — references a top-level ``subgraph``/``subgraphs`` + definition by name. Companions: ``inputs``, ``outputs`` for explicit + mapping (spec v0.2 §2). + - ``fan_out`` — see :class:`FanOutSpec`. + - ``flaky`` and the four ``flaky_*`` variants — harness mocks for + retry/checkpoint behaviours. + - ``calls_llm`` — see :class:`CallsLlmSpec`. 
+ + Companion modifiers (additive, may combine with most primaries): + + - ``emits_log`` — fires a log record with the node's update. + - ``also_emits_via_global_tracer`` — fires a span on the OTel global + provider (used to verify isolation). + - ``middleware`` — per-node middleware list (spec v0.5 §3). + """ + + # Primary directives — exactly one of these must be set. + update: dict[str, Any] | None = None + update_pure: dict[str, Any] | None = None + update_pure_from_state: dict[str, Any] | None = None + update_from_field: UpdateFromFieldSpec | None = None + raises: str | None = None + subgraph: str | None = None + fan_out: FanOutSpec | None = None + flaky: FlakySpec | None = None + flaky_by_index: FlakyByIndexSpec | None = None + flaky_per_index: FlakyPerIndexSpec | None = None + flaky_instance_only: FlakyInstanceOnlySpec | None = None + flaky_resume_aware: FlakyResumeAwareSpec | None = None + calls_llm: CallsLlmSpec | None = None + + # Companions — additive. + inputs: dict[str, str] | None = None + outputs: dict[str, str] | None = None + middleware: list[MiddlewareSpec] | None = None + emits_log: EmitsLogSpec | None = None + also_emits_via_global_tracer: GlobalTracerSpec | None = None + # Pair with ``raises`` to specify the error category (graph-engine §4). 
+ error_category: str | None = None + + _PRIMARY_FIELDS = ( + "update", + "update_pure", + "update_pure_from_state", + "update_from_field", + "raises", + "subgraph", + "fan_out", + "flaky", + "flaky_by_index", + "flaky_per_index", + "flaky_instance_only", + "flaky_resume_aware", + "calls_llm", + ) + + @model_validator(mode="after") + def _exactly_one_primary(self) -> NodeSpec: + set_primaries = [field for field in self._PRIMARY_FIELDS if getattr(self, field) is not None] + if len(set_primaries) == 0: + raise ValueError(f"node has no primary directive (one of: {list(self._PRIMARY_FIELDS)})") + if len(set_primaries) > 1: + raise ValueError(f"node has multiple primary directives: {set_primaries}; exactly one is allowed") + return self + + +# --------------------------------------------------------------------------- +# Middleware +# --------------------------------------------------------------------------- + + +class RetryMiddleware(_AllowExtras): + type: Literal["retry"] + max_attempts: int + classifier: dict[str, Any] | None = None + backoff: dict[str, Any] | None = None + + +class TimingMiddleware(_AllowExtras): + type: Literal["timing"] + # ``on_complete`` shape varies: 014 uses a string label + # (``"capture"``); 012 uses a dict (``{capture_to: timing_records}``) + # to point at a state field. Permissive Any covers both. 
+ on_complete: Any = None + + +class ErrorRecoveryMiddleware(_AllowExtras): + """Test-seam middleware (006): catches exceptions, returns a synthetic + state update instead of re-raising.""" + + type: Literal["error_recovery"] + catch_categories: list[str] | None = None + on_error_update: dict[str, Any] | None = None + + +class ShortCircuitMiddleware(_AllowExtras): + """Test-seam middleware (004): bypasses ``next()`` and returns a + state update directly, never invoking the wrapped node.""" + + type: Literal["short_circuit"] + update: dict[str, Any] | None = None + + +class TraceRecorderMiddleware(_AllowExtras): + """Test-seam middleware (002): records the order of pre/post-node + callbacks for composition-ordering assertions. Carries free-form + parameters for the harness mock — typical fixtures supply + ``name``, ``pre_marker``, ``post_marker`` to label the recorded + entries.""" + + type: Literal["trace_recorder"] + + +MiddlewareSpec = Annotated[ + RetryMiddleware + | TimingMiddleware + | ErrorRecoveryMiddleware + | ShortCircuitMiddleware + | TraceRecorderMiddleware, + Field(discriminator="type"), +] + + +class MiddlewareConfig(_ForbidExtras): + """Top-level ``middleware:`` block — registers middlewares per-graph + and/or per-node. ``per_graph`` wraps every node; ``per_node`` is a map + of node name to per-node list.""" + + per_graph: list[MiddlewareSpec] | None = None + per_node: dict[str, list[MiddlewareSpec]] | None = None + + +# Resolve the forward references on NodeSpec/FanOutSpec. +NodeSpec.model_rebuild() +FanOutSpec.model_rebuild() + + +# --------------------------------------------------------------------------- +# Mock provider / mock LLM responses (llm-provider + observability) +# --------------------------------------------------------------------------- + + +class MockResponse(_AllowExtras): + """One canned response from the harness mock LLM provider. + + Common fields: + + - ``status`` (int) + ``body`` (dict) — successful HTTP response. 
+ - ``raises_category`` (str) + ``cause`` — error-categories fixtures. + - ``connection_failure`` (bool) — network failure simulation + (llm-provider/004 connection_failure case). + + Permissive shape because the body's content mirrors OpenAI's wire + format which is wide and evolving; modelling every field would + duplicate the OpenAI schema. The ``llm-provider`` capability's + spec.md §8 is the authoritative shape. + """ + + status: int | None = None + body: dict[str, Any] | None = None + raises_category: str | None = None + cause: dict[str, Any] | None = None + connection_failure: bool | None = None + + +class MockProviderConfig(_AllowExtras): + """``mock_provider:`` block. + + - ``responses`` is consumed in order: each ``complete()`` call pops + the next entry. + - ``health_endpoint`` configures the mock for ``ready()`` checks + (llm-provider/007); the harness exposes a separate health-probe + response distinct from the ``responses`` queue. + + Permissive shape — fixture-specific config knobs (e.g. retry + intervals, simulated latencies) may appear without breaking parse. + """ + + responses: list[MockResponse] | None = None + health_endpoint: dict[str, Any] | None = None + + +# --------------------------------------------------------------------------- +# Observers (graph-engine §6) +# --------------------------------------------------------------------------- + + +class ObserverSpec(_ForbidExtras): + """One observer registration. + + - ``attach`` is ``graph`` (graph-attached, persists across invocations) + or ``invocation`` (passed to one ``invoke`` call). + - ``target`` is ``outer`` (outermost graph) or a subgraph name. + - ``behavior`` is ``record`` (capture events for assertion) or + ``raise`` (raise to verify error isolation). + - ``phases`` (optional, spec v0.6 §6) — subset of ``{"started", + "completed"}`` for per-observer phase subscription. 
+ """ + + name: str + attach: Literal["graph", "invocation"] + target: str + behavior: Literal["record", "raise"] + phases: list[Literal["started", "completed"]] | None = None + + +# --------------------------------------------------------------------------- +# LlmProviderFixture's `calls` +# --------------------------------------------------------------------------- + + +class LlmCallSpec(_AllowExtras): + """One call against the mock provider. + + ``operation`` is ``complete`` (with ``messages`` + optional ``tools``) + or ``ready`` (no inputs). Other call params (temperature, max_tokens, + top_p, seed, etc.) may appear and are passed through to the + underlying provider call. ``expected`` is checked against the result. + """ + + operation: Literal["complete", "ready"] + messages: list[dict[str, Any]] | None = None + tools: list[dict[str, Any]] | None = None + # Optional — when missing, the case-level ``expected:`` carries + # the assertion (the per-call vs per-case split). + expected: dict[str, Any] | None = None + + +__all__ = [ + "CallsLlmSpec", + "EdgeSpec", + "EmitsLogSpec", + "ErrorRecoveryMiddleware", + "FailureSpec", + "FanOutSpec", + "FlakyByIndexSpec", + "FlakyInstanceOnlySpec", + "FlakyPerIndexSpec", + "FlakyResumeAwareSpec", + "FlakySpec", + "GlobalTracerSpec", + "LlmCallSpec", + "MiddlewareConfig", + "MiddlewareSpec", + "MockProviderConfig", + "MockResponse", + "NodeSpec", + "ObserverSpec", + "RetryMiddleware", + "ShortCircuitMiddleware", + "StateFieldSpec", + "StateSchema", + "TimingMiddleware", + "TraceRecorderMiddleware", + "UpdateFromFieldSpec", +] diff --git a/tests/conformance/harness/expectations.py b/tests/conformance/harness/expectations.py new file mode 100644 index 0000000..dddfbbc --- /dev/null +++ b/tests/conformance/harness/expectations.py @@ -0,0 +1,257 @@ +"""Typed ``expected:`` block models — per-capability shapes for the +fixture's assertion payload. 
+ +The four capabilities have non-overlapping expected shapes (an +observability fixture wouldn't have ``checkpoint_saves``; a graph-engine +fixture wouldn't have ``span_tree``). Modelling each cleanly catches +fixture authors mixing keys across capabilities, and gives runtime code +in :mod:`runtime` typed access to the assertion payload it needs. + +Phase 0 typing depth: TOP-LEVEL keys per capability are exhaustively +typed (catches new directives the spec adds). The nested payload values +underneath (e.g., individual span tree entries, observer event details) +are kept loose as ``list[Any]`` / ``dict[str, Any]`` because the runtime +phases that consume them are the right place to tighten — Phase 1 +will type observer-event entries, Phase 5 will type span_tree, etc. +""" + +from __future__ import annotations + +from typing import Annotated, Any, Literal, cast + +from pydantic import BaseModel, ConfigDict, Discriminator, Tag + + +class _ForbidExtras(BaseModel): + model_config = ConfigDict(extra="forbid") + + +# --------------------------------------------------------------------------- +# graph-engine expected block +# --------------------------------------------------------------------------- + + +class GraphEngineExpected(_ForbidExtras): + """Expected block for graph-engine fixtures (001–018). + + Top-level keys union'd across every fixture in + ``spec/graph-engine/conformance/`` at v0.8.0. + """ + + final_state: dict[str, Any] | None = None + execution_order: list[str] | None = None + expected_error: dict[str, Any] | None = None + # Two shapes seen in fixtures: + # - dict[observer_name, list[event_dict]] — most fixtures + # - list[event_dict] flat — pipeline-utilities/011 (single-observer) + # Permissive ``Any`` until Phase 1 (engine retrofit) tightens. 
+ observer_events: Any = None + delivery_order: list[dict[str, Any]] | None = None + observer_event_invariants: dict[str, Any] | None = None + # 015 — invoke() returns normally; obs_raiser's exceptions surface to + # warnings rather than propagate. + no_propagated_error: bool | None = None + # 018 — registering an observer with `phases: []` raises at + # registration time per spec §6. + empty_phases_raises_at_registration: bool | None = None + + +# --------------------------------------------------------------------------- +# llm-provider expected block +# --------------------------------------------------------------------------- + + +class LlmProviderResponseAssertion(_ForbidExtras): + """Assertion payload for a successful ``complete()`` call.""" + + message: dict[str, Any] | None = None + finish_reason: str | None = None + usage: dict[str, Any] | None = None + raw_check: dict[str, Any] | None = None + + +class LlmProviderRaisesAssertion(BaseModel): + """Assertion payload for a call that's expected to raise. + + Permissive — fixtures attach assertion-specific knobs like + ``retry_after_seconds`` (rate-limit fixture) without restructuring + the type. The runtime in Phase 2 validates the keys it reads. + """ + + model_config = ConfigDict(extra="allow") + + category: str + message: str | None = None + cause: dict[str, Any] | None = None + + +class LlmProviderExpected(_ForbidExtras): + """Expected block for llm-provider fixtures. + + A call's ``expected:`` carries ``response`` (success path), + ``raises`` (error path), or ``success`` (boolean for the ``ready`` + operation). Mutually exclusive in practice. 
+ """ + + response: LlmProviderResponseAssertion | None = None + raises: LlmProviderRaisesAssertion | None = None + success: bool | None = None + + +# --------------------------------------------------------------------------- +# pipeline-utilities expected block +# --------------------------------------------------------------------------- + + +class PipelineUtilitiesExpected(_ForbidExtras): + """Expected block for pipeline-utilities fixtures (001–031). + + Spans middleware, fan-out, and checkpointing; the union is wide. + """ + + final_state: dict[str, Any] | None = None + execution_order: list[str] | None = None + expected_error: dict[str, Any] | None = None + # Two shapes seen in fixtures: + # - dict[observer_name, list[event_dict]] — most fixtures + # - list[event_dict] flat — pipeline-utilities/011 (single-observer) + # Permissive ``Any`` until Phase 1 (engine retrofit) tightens. + observer_events: Any = None + observer_event_invariants: dict[str, Any] | None = None + # Singular form used by 015 — assert one specific event shape. + expected_observer_event: dict[str, Any] | None = None + # Checkpointing fixtures (024–031). + checkpoint_saves: list[dict[str, Any]] | None = None + latest_record_assertions: dict[str, Any] | None = None + invariants: dict[str, Any] | None = None + # Fan-out fixtures (017–023). + concurrency_invariant: dict[str, Any] | None = None + # Timing middleware fixtures (012–014). + timing_records: list[dict[str, Any]] | None = None + # Trace recorder middleware fixtures (001–003). Two shapes: + # - dict[recorder_name, list[record]] when multiple recorders (001). + # - list[record] flat when a single recorder. + trace_records: Any = None + + +# --------------------------------------------------------------------------- +# observability expected block +# --------------------------------------------------------------------------- + + +class ObservabilityExpected(_ForbidExtras): + """Expected block for observability fixtures (001–011). 
+ + Span trees come in three flavours depending on what the fixture + verifies: + + - ``span_tree`` — single-trace, single-exporter (the common case). + - ``span_tree_private`` / ``span_tree_global`` — dual-exporter + isolation case (fixture 005 + the global-tracer companion). + - ``traces`` — multi-trace case (detached subgraphs/fan-outs in + fixture 008 produce multiple root traces). + """ + + span_tree: list[dict[str, Any]] | None = None + span_tree_private: list[dict[str, Any]] | None = None + span_tree_global: list[dict[str, Any]] | None = None + traces: list[dict[str, Any]] | None = None + parent_trace: dict[str, Any] | None = None + detached_trace_count: int | None = None + # Logs Bridge (fixture 010). + log_records: list[dict[str, Any]] | None = None + # Negative assertions used across 005, 008, 010, 011. + no_global_provider_spans: bool | None = None + no_openarmature_spans_on_global: bool | None = None + no_edge_spans: bool | None = None + no_llm_provider_span: bool | None = None + # Invariants block (fixture 011 determinism). + invariants: dict[str, Any] | None = None + determinism_check: dict[str, Any] | None = None + # Multi-invocation fixtures (009 cross-cutting, 011 determinism). 
+ invocation_count: int | None = None + + +# --------------------------------------------------------------------------- +# Discriminated union — pick by which capability-specific keys appear +# --------------------------------------------------------------------------- + + +_GRAPH_ENGINE_KEYS = frozenset( + { + "no_propagated_error", + "empty_phases_raises_at_registration", + } +) +_LLM_PROVIDER_KEYS = frozenset({"response", "raises", "success"}) +_PIPELINE_UTILITIES_KEYS = frozenset( + { + "checkpoint_saves", + "latest_record_assertions", + "concurrency_invariant", + "timing_records", + "trace_records", + "expected_observer_event", + } +) +_OBSERVABILITY_KEYS = frozenset( + { + "span_tree", + "span_tree_private", + "span_tree_global", + "traces", + "parent_trace", + "detached_trace_count", + "log_records", + "no_global_provider_spans", + "no_openarmature_spans_on_global", + "no_edge_spans", + "no_llm_provider_span", + "determinism_check", + "invocation_count", + } +) + + +def _discriminate_expected( + value: Any, +) -> Literal["graph_engine", "llm_provider", "pipeline_utilities", "observability"]: + """Pick the per-capability expected shape from the dict's keys. + + Capability-specific keys take priority. For shape-overlap (e.g. + ``final_state`` is in both graph-engine and pipeline-utilities), the + fixture's location on disk is the authoritative tag — but expected + blocks themselves don't know that, so we discriminate on the keys + that ARE distinctive and fall back to graph-engine for plain + ``final_state``-only fixtures. 
+ """ + if not isinstance(value, dict): + return "graph_engine" + keys: set[str] = {str(k) for k in cast("dict[str, Any]", value)} + if keys & _LLM_PROVIDER_KEYS and not keys & _GRAPH_ENGINE_KEYS: + return "llm_provider" + if keys & _OBSERVABILITY_KEYS: + return "observability" + if keys & _PIPELINE_UTILITIES_KEYS: + return "pipeline_utilities" + return "graph_engine" + + +ExpectedBlock = Annotated[ + Annotated[GraphEngineExpected, Tag("graph_engine")] + | Annotated[LlmProviderExpected, Tag("llm_provider")] + | Annotated[PipelineUtilitiesExpected, Tag("pipeline_utilities")] + | Annotated[ObservabilityExpected, Tag("observability")], + Discriminator(_discriminate_expected), +] + + +__all__ = [ + "ExpectedBlock", + "GraphEngineExpected", + "LlmProviderExpected", + "LlmProviderRaisesAssertion", + "LlmProviderResponseAssertion", + "ObservabilityExpected", + "PipelineUtilitiesExpected", +] diff --git a/tests/conformance/harness/fixtures.py b/tests/conformance/harness/fixtures.py new file mode 100644 index 0000000..d53212f --- /dev/null +++ b/tests/conformance/harness/fixtures.py @@ -0,0 +1,269 @@ +"""Typed fixture root models. + +Per the Phase 0 plan: every YAML fixture under +``openarmature-spec/spec//conformance/`` lands as one of three +typed shapes. The shape is chosen by a callable discriminator that inspects +the raw dict's top-level keys (no tag field is present in the YAML). + +The three shapes: + +- :class:`LlmProviderFixture` — ``mock_provider`` is at the top level. Tests + the stateless ``complete()`` / ``ready()`` operations of the + ``llm-provider`` capability against canned wire responses. May contain + ``cases:`` for table-style sub-cases that share the mock provider. + +- :class:`CasesFixture` — top-level ``cases:`` list (and no + ``mock_provider``). Each case carries its own graph definition and + expected block. Optional shared ``subgraph`` / ``subgraph_with_idx`` + blocks at the top level apply across cases. 
+ +- :class:`GraphFixture` — direct graph at the top level (state + entry + + nodes + edges + initial_state + expected). Optional ``run_count`` for + determinism fixtures, plus a long tail of optional harness directives + (``observers``, ``middleware``, ``caller_correlation_id``, + ``detached_subgraphs``, etc.). + +Sub-shapes (state field schemas, node directives, edge specs, middleware +specs, observer specs, expected blocks) live in :mod:`directives` and +:mod:`expectations`. The split is for readability; what's authoritative is +the union of all three shapes here parsing every fixture in the spec +submodule, with ``extra="forbid"`` rejecting unknown keys on the fixture roots (case and subgraph models deliberately allow extras). +""" + +from __future__ import annotations + +from typing import Annotated, Any, Literal + +from pydantic import BaseModel, ConfigDict, Discriminator, Tag + +from .directives import ( + EdgeSpec, + LlmCallSpec, + MiddlewareConfig, + MockProviderConfig, + MockResponse, + NodeSpec, + ObserverSpec, + StateSchema, +) +from .expectations import ExpectedBlock, LlmProviderExpected + + +class _ForbidExtras(BaseModel): + """Common base — strict by default. Catches both fixture authors and us + drifting from the spec; new directives surface as parse errors at the + point they're introduced rather than getting silently dropped.""" + + model_config = ConfigDict(extra="forbid") + + +# --------------------------------------------------------------------------- +# Shared sub-shapes +# --------------------------------------------------------------------------- + + +class SubgraphDefinition(BaseModel): + """A subgraph at the fixture's top level (singular ``subgraph:`` form + or one entry of the plural ``subgraphs:`` map). Carries its own state + schema, nodes, and edges — structurally a mini-graph. 
Permissive + extras to absorb subgraph-local middleware blocks (pipeline-utilities/ + 020) and any future extension.""" + + model_config = ConfigDict(extra="allow") + + name: str | None = None # singular `subgraph:` form + state: StateSchema + entry: str + nodes: dict[str, NodeSpec] + edges: list[EdgeSpec] + middleware: MiddlewareConfig | None = None + + +class CaseSpec(BaseModel): + """One sub-case in a ``CasesFixture`` (or in the ``cases:`` block of an + LlmProviderFixture). + + The shape of a case is fluid — checkpointing fixtures (027–031) bring + in ``checkpointer``/``first_run_expected_error``/``saved_record_assertions``/ + ``resume`` blocks; llm-provider cases bring in ``call`` / + ``expected_wire_request``; graph-engine ``007-compile-errors`` cases + have ``graph:`` wrapping the graph + ``expected_compile_error``; + observability cases inherit any harness directive a top-level + ``GraphFixture`` could carry. Permissive extras so the parse keeps + pace with case-shape evolution without quarterly model edits. + """ + + model_config = ConfigDict(extra="allow") + + name: str + description: str | None = None + # graph-engine 007 compile-errors: a case wraps the malformed graph + # under a `graph:` key alongside `expected_compile_error`. + graph: dict[str, Any] | None = None + expected_compile_error: str | None = None + # The graph-shaped fields when a case carries the graph inline (rather + # than under ``graph:``). + state: StateSchema | None = None + entry: str | None = None + nodes: dict[str, NodeSpec] | None = None + edges: list[EdgeSpec] | None = None + initial_state: dict[str, Any] | None = None + subgraph: SubgraphDefinition | None = None + subgraphs: dict[str, SubgraphDefinition] | None = None + middleware: MiddlewareConfig | None = None + observers: list[ObserverSpec] | None = None + expected: ExpectedBlock | None = None + expected_error: dict[str, Any] | None = None + # llm-provider sub-cases. 
+ call: LlmCallSpec | None = None + expected_wire_request: dict[str, Any] | None = None + # Checkpointing fixtures (024–031). + checkpointer: str | None = None + first_run_expected_error: dict[str, Any] | None = None + saved_record_assertions: dict[str, Any] | None = None + latest_record_assertions: dict[str, Any] | None = None + resume: dict[str, Any] | None = None + invariants: dict[str, Any] | None = None + # Either an int (run count) or a list of run configs — fixtures vary. + populate_checkpointer_via_runs: Any = None + invoke_with: dict[str, Any] | None = None + caller_correlation_id: str | None = None + # observability — mock LLM responses + per-case run config. + mock_llm: list[MockResponse] | None = None + invocations: int | None = None + + +# --------------------------------------------------------------------------- +# LlmProviderFixture +# --------------------------------------------------------------------------- + + +class LlmProviderFixture(_ForbidExtras): + """A fixture under ``spec/llm-provider/conformance/``. + + Either ``calls`` is at the top level (single-case) or wrapped in + ``cases`` (table-style). ``mock_provider`` is always present and + discriminates this shape from the graph-shaped fixtures. + """ + + mock_provider: MockProviderConfig + calls: list[LlmCallSpec] | None = None + cases: list[CaseSpec] | None = None + + +# --------------------------------------------------------------------------- +# CasesFixture +# --------------------------------------------------------------------------- + + +class CasesFixture(_ForbidExtras): + """A fixture whose top level is ``cases:`` rather than a single graph. + + Used by ``007-compile-errors``, the checkpointing fixtures (024–031), + and the determinism / multi-run observability fixtures. Optional shared + ``subgraph`` / ``subgraph_with_idx`` at the top level apply across all + cases. Any other top-level key not listed here is rejected. 
+ """ + + cases: list[CaseSpec] + # Shared graph-shape blocks that apply across every case. Empirically + # only `subgraph` and `subgraph_with_idx` appear at the top level of + # cases-fixtures; the plural `subgraphs` form has not been seen at + # the cases-fixture top level. + subgraph: SubgraphDefinition | None = None + subgraph_with_idx: SubgraphDefinition | None = None + + +# --------------------------------------------------------------------------- +# GraphFixture +# --------------------------------------------------------------------------- + + +class GraphFixture(_ForbidExtras): + """A fixture whose top level IS a single graph. + + Covers the bulk of graph-engine, pipeline-utilities, and observability + fixtures. Most fields are optional because different fixtures exercise + different facets of the graph contract. + """ + + # Graph definition (graph-engine + most others). + state: StateSchema + entry: str | None = None + nodes: dict[str, NodeSpec] | None = None + edges: list[EdgeSpec] | None = None + initial_state: dict[str, Any] | None = None + expected: ExpectedBlock | None = None + + # Legacy: top-level expected_error in graph-engine fixtures 008/009. + expected_error: dict[str, Any] | None = None + + # Subgraph definitions — singular form for graph-engine; plural map for + # the multi-subgraph cases in observability/008, observability/010, and + # pipeline-utilities/029. + subgraph: SubgraphDefinition | None = None + subgraphs: dict[str, SubgraphDefinition] | None = None + # Used by pipeline-utilities/020 (fan-out instances expose their idx). + subgraph_with_idx: SubgraphDefinition | None = None + + # graph-engine §6 observers (since proposal 0003). + observers: list[ObserverSpec] | None = None + + # pipeline-utilities §6 middleware (proposal 0004) and §10 checkpointer + # registration (proposal 0008). 
+ middleware: MiddlewareConfig | None = None + checkpointer: str | None = None + clock_stub: dict[str, Any] | None = None + + # Determinism fixtures — graph-engine/010 and pipeline-utilities/011. + run_count: int | None = None + + # observability / pipeline-utilities cross-cutting harness directives. + # These are inputs to the test harness, NOT the engine. + caller_correlation_id: str | None = None + detached_subgraphs: list[str] | None = None + detached_fan_outs: list[str] | None = None + disable_llm_spans: bool | None = None + mock_llm: list[MockResponse] | None = None + caller_global_otel_active: bool | None = None + invocations: int | None = None + + +# --------------------------------------------------------------------------- +# Discriminator + root union +# --------------------------------------------------------------------------- + + +def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph"]: + """Pick the fixture shape from a raw YAML dict. + + Order matters: ``mock_provider`` wins over ``cases`` because some + llm-provider fixtures (e.g. 003-message-validation) have BOTH — + ``mock_provider`` is the load-bearing discriminator, ``cases`` is just + the table style for sub-cases. 
+ """ + if isinstance(value, dict): + if "mock_provider" in value: + return "llm_provider" + if "cases" in value: + return "cases" + return "graph" + + +Fixture = Annotated[ + Annotated[LlmProviderFixture, Tag("llm_provider")] + | Annotated[CasesFixture, Tag("cases")] + | Annotated[GraphFixture, Tag("graph")], + Discriminator(_discriminate_fixture), +] + + +__all__ = [ + "CaseSpec", + "CasesFixture", + "Fixture", + "GraphFixture", + "LlmProviderExpected", + "LlmProviderFixture", + "SubgraphDefinition", +] diff --git a/tests/conformance/harness/loader.py b/tests/conformance/harness/loader.py new file mode 100644 index 0000000..271e561 --- /dev/null +++ b/tests/conformance/harness/loader.py @@ -0,0 +1,74 @@ +"""Discovery + parsing for spec conformance fixtures. + +Two entry points: + +- :func:`discover_fixtures` walks the four capability directories under the + pinned ``openarmature-spec`` submodule and yields ``(capability, path)`` + pairs sorted by capability then filename. Used by parametrized pytest + collection. + +- :func:`load_fixture` parses one YAML file into a typed + :data:`fixtures.Fixture` (one of the three discriminated variants). +""" + +from __future__ import annotations + +from collections.abc import Iterator +from pathlib import Path + +import yaml +from pydantic import TypeAdapter + +from .fixtures import Fixture + +# All four capability directories the spec defines under ``spec/``. Keep this +# list in sync with the spec repo's top-level ``spec/<capability>/`` +# layout. Adding a fifth capability is a "knob" change — the discovery and +# parsing already work; you'd only need to extend the per-capability +# expected-block models in :mod:`expectations`. +CAPABILITIES: tuple[str, ...] 
= ( + "graph-engine", + "llm-provider", + "pipeline-utilities", + "observability", +) + +CONFORMANCE_ROOT = Path(__file__).resolve().parents[3] / "openarmature-spec" / "spec" + +# pydantic v2 needs an adapter to validate against an Annotated/Union type +# that isn't itself a BaseModel subclass. Built once, reused per call. +_FIXTURE_ADAPTER: TypeAdapter[Fixture] = TypeAdapter(Fixture) + + +def discover_fixtures() -> Iterator[tuple[str, Path]]: + """Yield ``(capability, fixture_path)`` for every ``NNN-*.yaml`` under + each capability's ``conformance/`` directory, sorted deterministically + so pytest parametrization IDs are stable across runs. + """ + for capability in CAPABILITIES: + conformance_dir = CONFORMANCE_ROOT / capability / "conformance" + if not conformance_dir.is_dir(): + continue + for path in sorted(conformance_dir.glob("[0-9][0-9][0-9]-*.yaml")): + yield capability, path + + +def load_fixture(path: Path) -> Fixture: + """Parse a fixture YAML into one of the three typed variants. + + The discriminator inspects top-level keys to pick + :class:`LlmProviderFixture` (when ``mock_provider`` is present), + :class:`CasesFixture` (when ``cases`` is present and no + ``mock_provider``), or :class:`GraphFixture` (default). + + Raises ``pydantic.ValidationError`` on schema violations — the + ``extra="forbid"`` config in :mod:`fixtures` makes any unknown + top-level key fail loudly, which is how we catch the spec adding + directives we haven't modelled yet. 
+ """ + with path.open() as f: + raw = yaml.safe_load(f) + return _FIXTURE_ADAPTER.validate_python(raw) + + +__all__ = ["CAPABILITIES", "CONFORMANCE_ROOT", "discover_fixtures", "load_fixture"] diff --git a/tests/conformance/harness/runtime/README.md b/tests/conformance/harness/runtime/README.md new file mode 100644 index 0000000..c4eecc5 --- /dev/null +++ b/tests/conformance/harness/runtime/README.md @@ -0,0 +1,10 @@ +# `harness/runtime/` + +Phase 0 (typed parser) lives in the parent `harness/` package; this directory +is the home for the **runtime** — the code that takes a parsed fixture and +actually executes it against the engine. Implementations land here in +Phases 1–6, one capability or directive at a time. + +Phase 0 deliberately ships an empty `runtime/` to lock in the boundary: +parsing is fixed (every fixture lands as a typed config validated once), and +phases that follow only add interpretation, never re-touch the parsing. diff --git a/tests/conformance/harness/runtime/__init__.py b/tests/conformance/harness/runtime/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conformance/harness/skip.py b/tests/conformance/harness/skip.py new file mode 100644 index 0000000..4596710 --- /dev/null +++ b/tests/conformance/harness/skip.py @@ -0,0 +1,111 @@ +"""Structured skip-reason values for fixtures whose directives the current +phase doesn't yet support. + +Phase 0 ships parsing only — every Phase 1+ runtime test that consumes a +fixture marks itself skipped if the fixture references directives not yet +implemented. The skip-reason is structured (capability + directive list + +phase mapping) so test output makes the next-phase pickup obvious rather +than asking the reader to grep the implementation plan. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +# Mapping from spec directive names → the phase that lands their runtime +# implementation. 
Sourced from the implementation-plan agreement (see +# `_docs/implementation-plan.md` if/when that lands; for now the phase +# numbers are: 1 = engine pair model, 2 = llm-provider, 3 = middleware, +# 4 = fan-out, 5 = OTel observability, 6 = checkpointing). +DIRECTIVE_PHASE: dict[str, int] = { + # Phase 1 — graph-engine (proposals 0001 + 0002 + 0003 + 0005's §6 revision) + "observers": 1, + "phases_subscription": 1, + "phase": 1, + "fan_out_index": 1, + "attempt_index": 1, + # Phase 2 — llm-provider (proposal 0006) + "mock_provider": 2, + "calls_llm": 2, + "expected_wire_request": 2, + # Phase 3 — pipeline-utilities middleware (proposal 0004) + "middleware": 3, + "flaky": 3, + "clock_stub": 3, + # Phase 4 — pipeline-utilities fan-out (proposal 0005) + "fan_out": 4, + "flaky_by_index": 4, + "flaky_instance_only": 4, + "subgraph_with_idx": 4, + # Phase 5 — observability (proposal 0007) + "caller_correlation_id": 5, + "detached_subgraphs": 5, + "detached_fan_outs": 5, + "disable_llm_spans": 5, + "mock_llm": 5, + "caller_global_otel_active": 5, + "invocations": 5, + "emits_log": 5, + "also_emits_via_global_tracer": 5, + # Phase 6 — checkpointing (proposal 0008) + "checkpointer": 6, + "first_run_expected_error": 6, + "saved_record_assertions": 6, + "resume": 6, + "populate_checkpointer_via_runs": 6, + "flaky_per_index": 6, + "flaky_resume_aware": 6, + "update_pure_from_state": 6, +} + +PHASE_TITLE: dict[int, str] = { + 1: "engine pair-model + fan-out scaffolding", + 2: "llm-provider", + 3: "pipeline-utilities middleware", + 4: "pipeline-utilities fan-out", + 5: "observability (OTel)", + 6: "checkpointing", +} + + +@dataclass(frozen=True) +class SkipReason: + """Why a runtime test is skipped at the current phase. + + Render via :meth:`format` for pytest skip-message output. The + rendered string is action-readable ("phase 1 hasn't shipped yet, + look at directives X, Y") rather than generic ("not implemented"). 
+ """ + + fixture: str # capability/path-relative identifier, e.g. "graph-engine/012-..." + current_phase: int + missing_directives: tuple[str, ...] + + def format(self) -> str: + """Render the skip message shown in pytest's `-v` output.""" + if not self.missing_directives: + return f"{self.fixture}: nothing to skip on (current phase {self.current_phase})" + # Group directives by their landing phase so the message reads + # "lands in phase 4 (fan-out): [fan_out, flaky_by_index]". + by_phase: dict[int, list[str]] = {} + unknown: list[str] = [] + for directive in self.missing_directives: + phase = DIRECTIVE_PHASE.get(directive) + if phase is None: + unknown.append(directive) + else: + by_phase.setdefault(phase, []).append(directive) + + parts = [ + f"phase {phase} ({PHASE_TITLE[phase]}): {sorted(directives)}" + for phase, directives in sorted(by_phase.items()) + ] + if unknown: + parts.append(f"unmapped: {sorted(unknown)}") + return ( + f"{self.fixture}: needs directives not yet supported at phase " + f"{self.current_phase} — {'; '.join(parts)}" + ) + + +__all__ = ["DIRECTIVE_PHASE", "PHASE_TITLE", "SkipReason"] diff --git a/tests/conformance/test_conformance.py b/tests/conformance/test_conformance.py index 76b429f..aebfa24 100644 --- a/tests/conformance/test_conformance.py +++ b/tests/conformance/test_conformance.py @@ -8,7 +8,7 @@ from __future__ import annotations from pathlib import Path -from typing import Any +from typing import Any, cast import pytest import yaml @@ -56,10 +56,90 @@ def _fixture_id(path: Path) -> str: ] +def _needs_pair_model(spec: dict[str, Any]) -> bool: + """True if the fixture's expected observer events use the v0.6.0 pair + model (`phase: started/completed`). The current engine emits the + pre-v0.6.0 single-event model; Phase 1 retrofits the pair model. 
+ """ + expected = spec.get("expected") + if not isinstance(expected, dict): + return False + events_by_name = cast("dict[str, Any]", expected).get("observer_events") + if not isinstance(events_by_name, dict): + return False + for events in cast("dict[str, Any]", events_by_name).values(): + if not isinstance(events, list): + continue + for event in cast("list[Any]", events): + if isinstance(event, dict) and "phase" in event: + return True + return False + + +# Node directives the legacy adapter doesn't (yet) translate. Phase 1+ will +# either expand the adapter or replace it with the typed harness. +_UNSUPPORTED_NODE_DIRECTIVES = frozenset( + { + "fan_out", + "flaky", + "flaky_by_index", + "flaky_per_index", + "flaky_instance_only", + "flaky_resume_aware", + "calls_llm", + "update_pure", + "update_pure_from_state", + "update_from_field", + "emits_log", + "also_emits_via_global_tracer", + } +) + + +def _unsupported_directive(spec: dict[str, Any]) -> str | None: + """Return the first node directive the legacy adapter can't translate, + or None if every node uses one of the directives it handles. 
Walks + both the top-level graph and an optional inner ``subgraph`` block.""" + + def scan(graph: Any) -> str | None: + if not isinstance(graph, dict): + return None + nodes = cast("dict[str, Any]", graph).get("nodes") + if not isinstance(nodes, dict): + return None + for node_name, node_spec in cast("dict[str, Any]", nodes).items(): + if not isinstance(node_spec, dict): + continue + for key in cast("dict[str, Any]", node_spec): + if key in _UNSUPPORTED_NODE_DIRECTIVES: + return f"{node_name}.{key}" + return None + + if (hit := scan(spec)) is not None: + return hit + if (hit := scan(spec.get("subgraph"))) is not None: + return hit + return None + + @pytest.mark.parametrize("fixture_path", _STANDARD_RUNTIME_FIXTURES, ids=_fixture_id) async def test_runtime_fixture(fixture_path: Path) -> None: spec = _load(fixture_path) + # Phase 0 — skip fixtures whose expected observer events use the v0.6.0 + # started/completed pair model. Phase 1 (engine retrofit) lands the + # pair model and turns these back on. + if _needs_pair_model(spec): + pytest.skip( + f"{fixture_path.stem}: needs phase 1 (engine pair-model retrofit) " + "— expected observer events carry `phase` field" + ) + # Phase 0 — skip fixtures whose nodes use directives the legacy adapter + # doesn't translate (fan_out, flaky variants, calls_llm, etc.). Each + # directive is gated to the phase that lands its runtime support. + if (hit := _unsupported_directive(spec)) is not None: + pytest.skip(f"{fixture_path.stem}: unsupported node directive {hit}") + # Subgraph fixtures (006, 011, 013) declare an inner subgraph that the # outer graph references by name. 
subgraphs: dict[str, Any] = {} diff --git a/tests/conformance/test_fixture_parsing.py b/tests/conformance/test_fixture_parsing.py new file mode 100644 index 0000000..a949f9c --- /dev/null +++ b/tests/conformance/test_fixture_parsing.py @@ -0,0 +1,57 @@ +"""Phase 0 exit criterion: every fixture in the spec submodule parses into a +typed harness config, AND the parse is round-trip stable (parse → dump → +parse produces an equal model). + +Round-trip stability catches the bug class where a directive lands in the +spec but our pydantic model silently drops it via ``extra="forbid"`` not +being applied (or, conversely, where a field is mistakenly typed loose +enough to accept a dict that doesn't actually round-trip cleanly). +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .harness import discover_fixtures, load_fixture + + +def _id(case: tuple[str, Path]) -> str: + capability, path = case + return f"{capability}/{path.stem}" + + +_FIXTURES = list(discover_fixtures()) + + +def test_inventory_is_non_empty() -> None: + """Sanity guard. The spec submodule should expose 68+ fixtures across + the four capabilities. If discover returns zero, the submodule pin is + wrong or the directory layout changed.""" + assert len(_FIXTURES) > 0, "no conformance fixtures discovered" + + +@pytest.mark.parametrize("case", _FIXTURES, ids=_id) +def test_fixture_parses(case: tuple[str, Path]) -> None: + """Every fixture parses into one of the three typed variants. The + discriminator routes to ``LlmProviderFixture``, ``CasesFixture``, or + ``GraphFixture`` based on top-level keys; ``extra="forbid"`` rejects + any unknown top-level field.""" + _, path = case + load_fixture(path) + + +@pytest.mark.parametrize("case", _FIXTURES, ids=_id) +def test_fixture_round_trips(case: tuple[str, Path]) -> None: + """Parse → ``model_dump`` → re-parse → equal. 
Exit criterion for + Phase 0 per the implementation plan: catches dropped fields the user + intended to use later.""" + _, path = case + parsed = load_fixture(path) + dumped = parsed.model_dump(exclude_none=True) + # Re-parse via the same loader path so the discriminator runs again. + from .harness.loader import _FIXTURE_ADAPTER + + reparsed = _FIXTURE_ADAPTER.validate_python(dumped) + assert parsed == reparsed, f"round-trip mismatch for {path}"