chore: src/ README audit gate + 5 package READMEs (#126, #152) (#71)

constk · web-flow · commit 843c2537fa2f · 2026-04-30T20:24:41.000+10:00
diff --git a/.github/branch-protection/develop.json b/.github/branch-protection/develop.json
@@ -12,6 +12,7 @@
       "Version bump check",
       "Action pinning audit",
       "Tests required",
+      "src/ README audit",
       "Frontend Build",
       "Frontend Quality",
       "Branch-protection contexts sync",
diff --git a/.github/branch-protection/main.json b/.github/branch-protection/main.json
@@ -12,6 +12,7 @@
       "Version bump check",
       "Action pinning audit",
       "Tests required",
+      "src/ README audit",
       "Frontend Build",
       "Frontend Quality",
       "Branch-protection contexts sync",
diff --git a/.github/scripts/check_src_readmes.py b/.github/scripts/check_src_readmes.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""Enforce the per-package README rule from `CLAUDE.md`.
+
+`CLAUDE.md` *Code standards*: *"Each `src/` directory has a README
+explaining its purpose and key interfaces."* As `src/` grows, READMEs
+go missing silently — and even when one exists, it can degrade into
+unstructured prose that no contributor reads. This script audits both
+shape and substance.
+
+Behaviour:
+
+- Walks every subdirectory under `src/` (recursive, skipping
+  `__pycache__`).
+- A subdirectory must have a `README.md` when it contains at least one
+  `.py` file other than `__init__.py`. Empty directories and
+  `__init__.py`-only namespace packages are exempt — they have no
+  surface to document.
+- The README must be non-trivial: at least `MIN_BYTES = 200` bytes
+  after stripping leading/trailing whitespace. Catches the empty-stub
+  failure mode (#126).
+- The README must contain a `## Key interfaces` heading (or its
+  documented synonym `## Public surface`). #152 promoted the gate
+  from presence + size to presence + structure: a 200-byte unrelated
+  paragraph passed pre-#152, but doesn't actually document the
+  package's public surface. The Purpose statement is encoded
+  positionally as the H1 heading + immediately-following paragraph
+  rather than as an explicit `## Purpose` heading — that's the
+  existing convention across all seven packages and the natural
+  markdown shape; a separate `## Purpose` heading would duplicate
+  what the H1 already establishes.
+
+There is **no exemption mechanism**, mirroring `check_file_length.py`
+(see `feedback_no_noqa`). If a future package legitimately needs a
+shorter README (e.g. a single-file helper with a self-explanatory
+filename), restructure rather than carve out an allowlist entry.
+
+Exit codes:
+  0 — every package with code has a non-trivial, structured README
+  1 — at least one package is missing a README, has a stub one, or
+      lacks the required heading
+  2 — `src/` does not exist (run from the wrong directory?)
+
+Usage (from repo root):
+
+    python .github/scripts/check_src_readmes.py
+"""
+
+from __future__ import annotations
+
+import re
+import sys
+from pathlib import Path
+
+SRC_ROOT = Path("src")
+MIN_BYTES = 200
+
+# Required heading shapes. Each tuple is "any-of" — a README satisfies the
+# rule if it contains at least one matching heading. Match is anchored to
+# line start, case-insensitive, allows `#` levels 2-4 so a deeply nested
+# subsection still counts.
+KEY_INTERFACES_HEADING = re.compile(
+    r"^#{2,4}\s+(Key interfaces|Public surface)\b",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+
+def _normalised(path: Path) -> str:
+    return path.as_posix()
+
+
+def _has_documentable_code(directory: Path) -> bool:
+    """True when the directory has at least one `.py` file beyond `__init__.py`."""
+    for entry in directory.iterdir():
+        if entry.is_file() and entry.suffix == ".py" and entry.name != "__init__.py":
+            return True
+    return False
+
+
+def _readme_failure(directory: Path) -> str | None:
+    """Return an error message if the directory's README is missing or stub-sized."""
+    readme = directory / "README.md"
+    if not readme.is_file():
+        return (
+            f"::error file={_normalised(directory)}::missing README.md. "
+            "`CLAUDE.md` requires every `src/` package to document its "
+            "purpose and key interfaces."
+        )
+    body = readme.read_text(encoding="utf-8").strip()
+    if len(body.encode("utf-8")) < MIN_BYTES:
+        return (
+            f"::error file={_normalised(readme)}::README.md is shorter than "
+            f"{MIN_BYTES} bytes after stripping whitespace. Add purpose + "
+            "key-interfaces text — a single heading does not satisfy the rule."
+        )
+    if not KEY_INTERFACES_HEADING.search(body):
+        return (
+            f"::error file={_normalised(readme)}::README.md missing a "
+            "`## Key interfaces` heading (or the synonym `## Public surface`). "
+            "Per CLAUDE.md the README must document the package's public "
+            "surface; the heading anchors that section so contributors can "
+            "find it. Add the heading and list the package's exported names."
+        )
+    return None
+
+
+def main() -> int:
+    if not SRC_ROOT.is_dir():
+        print(f"::error::{SRC_ROOT.as_posix()} not found; run from repo root")
+        return 2
+
+    audited: list[Path] = []
+    failures: list[str] = []
+
+    for directory in sorted(p for p in SRC_ROOT.rglob("*") if p.is_dir()):
+        if directory.name == "__pycache__":
+            continue
+        if not _has_documentable_code(directory):
+            continue
+        audited.append(directory)
+        message = _readme_failure(directory)
+        if message is not None:
+            failures.append(message)
+
+    if failures:
+        for line in failures:
+            print(line)
+        print(
+            f"\n{len(failures)} package(s) failed the README audit. "
+            "Fix in this PR — there is no exemption mechanism, see the "
+            "module docstring."
+        )
+        return 1
+
+    print(
+        f"src/ README audit OK — {len(audited)} package(s) documented "
+        f"(min {MIN_BYTES} bytes, `## Key interfaces` heading required)."
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -151,6 +151,19 @@ jobs:
           python-version: "3.14"
       - run: python .github/scripts/check_tests_present.py
 
+  src-readmes:
+    name: src/ README audit
+    runs-on: ubuntu-latest
+    # CLAUDE.md: every `src/` package documents its purpose + key
+    # interfaces. The audit checks shape (presence + min 200 bytes) and
+    # structure (`## Key interfaces` heading). No exemption mechanism.
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
+        with:
+          python-version: "3.14"
+      - run: python .github/scripts/check_src_readmes.py
+
   frontend-build:
     name: Frontend Build
     runs-on: ubuntu-latest
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "harness-python-react"
-version = "0.2.0"
+version = "0.2.1"
 description = "Production-quality LLM-driven coding harness — Python (FastAPI) backend, Vite + React + TypeScript frontend."
 readme = "README.md"
 requires-python = ">=3.14"
diff --git a/src/api/README.md b/src/api/README.md
@@ -0,0 +1,18 @@
+# `src/api/`
+
+FastAPI HTTP surface for the harness scaffold. Owns request handling, route registration, lifespan hooks, and the in-memory session store. Imports from `src/observability/` (instrumentation), `src/models/` (Pydantic contracts) — never the other way around (enforced by `import-linter`).
+
+## Key interfaces
+
+- **`main.app`** — the `FastAPI()` instance with CORS middleware, OTel auto-instrumentation, and the v1 router mounted. Exposed for `uvicorn src.api.main:app` and the FastAPI `TestClient` fixtures in `tests/conftest.py`.
+- **`main.lifespan(app)`** — async context manager that runs `setup_tracing` → `setup_logging` → `instrument_httpx` → `instrument_fastapi` once on startup, then constructs the `SessionStore`. Single point of "what is process-globally configured?"
+- **`routes.router`** — `APIRouter(prefix="/api/v1")`. Every route a real client should hit lives here; un-versioned routes live on `app` directly (only the FastAPI auto-generated docs at `/docs` and `/redoc` qualify).
+- **`routes.health`** — `GET /api/v1/health` → `HealthResponse` (status + `importlib.metadata.version`-derived build version).
+- **`routes.echo`** — `GET /api/v1/echo?msg=...` → `EchoResponse`. Demonstrates the request/response contract pattern that real domain endpoints follow.
+- **`sessions.SessionStore`** — single-process in-memory dict mapping session id → conversation history. Plumbed onto `app.state.session_store` in lifespan; replace with a Redis or DB-backed implementation for multi-process deployments.
+
+## Conventions
+
+- Every route returns a `StrictModel` subclass — never a raw dict. The `extra="forbid"` config makes typos and renamed fields fail at construction, not three calls deep.
+- Routes are versioned under `/api/v1/`. Adding a v2 endpoint = a new router, never a breaking change to v1.
+- The CORS policy is wide-open in the scaffold so the Vite dev server proxy works on first run; tighten via config when a real auth layer arrives.
diff --git a/src/eval/README.md b/src/eval/README.md
@@ -0,0 +1,22 @@
+# `src/eval/`
+
+Provider-agnostic evaluation harness — golden QA cases driven by a caller-supplied `answer_fn`, three tolerance modes (exact / numeric / semantic), and a `Protocol`-based LLM-judge seam so no SDK leaks into this layer. Designed to be runnable on a fresh clone without LLM credentials (the example case uses `exact_match`); the nightly workflow opt-in is documented in `docs/EVAL_HARNESS.md`.
+
+## Key interfaces
+
+- **`models.EvalCase`** — Pydantic shape of an entry in `eval/golden_qa.json`: `id`, `question`, `expected_answer`, `tolerance`, plus optional `category` / `difficulty` / `expected_tools` / `notes`.
+- **`models.EvalResult`** — per-case outcome: actual answer, latency, pass/fail, tolerance score, failure reason.
+- **`runner.load_golden_dataset(path=None)`** — pure JSON loader. Standalone so call sites can introspect the dataset without paying for an `EvalRunner`.
+- **`runner.EvalRunner(answer_fn, judge_client=None, judge_model="")`** — the orchestrator. `answer_fn: Callable[[str], str]` is the single seam — wire your agent loop, direct LLM client, or stub. `judge_client` implements the `judge.LLMClient` Protocol and is only consulted for `semantic_similar` cases.
+  - **`evaluate(case)`** — run one case; returns `EvalResult`.
+  - **`evaluate_all()`** — load the golden dataset, evaluate every case, return the list.
+- **`judge.LLMClient`** — `Protocol` with one method: `complete_json(*, model, prompt) -> str`. Concrete adapters live in your agent code; the harness never imports OpenAI/Anthropic/Azure SDKs.
+- **`judge.evaluate_semantic_similarity(question, expected, actual, client, model)`** — calls the judge, returns `(score, explanation)`. Returns `(None, "no LLM client configured")` when `client is None` (inconclusive — runner treats it as pass with an explanatory note rather than a hard fail).
+- **`report.generate_report(results)`** — markdown summary: overall accuracy, per-category, per-difficulty, failure analysis with reason text.
+- **`__main__`** — `python -m src.eval` runs the dataset with an identity `answer_fn` (echoes the question) and prints the markdown report.
+
+## Conventions
+
+- **`coverage` `omit`s this package** — the eval suite is exercised by `eval/test_golden_qa.py`, not by `tests/`. Counting it would inflate misses on every PR that touches behaviour without re-running the eval workflow.
+- **No domain coupling.** The harness ships one trivial echo case; real users replace `eval/golden_qa.json` with their domain dataset and wire `answer_fn` to their agent.
+- **Tolerance picks happen per case**, not per runner. A dataset can mix `exact_match`, `numeric_close` (within 1%), and `semantic_similar` (LLM judge ≥ 0.8) cases freely.
diff --git a/src/models/README.md b/src/models/README.md
@@ -0,0 +1,17 @@
+# `src/models/`
+
+Pydantic contracts that cross module or process seams. Import-linter enforces that nothing outside this package matters here — `src.models` depends on **nothing in `src/`** (forbidden contract in `pyproject.toml`). The point: schema bugs surface at construction with clear `ValidationError`s, not as `AttributeError`s deep in the request path.
+
+## Key interfaces
+
+- **`_base.StrictModel`** — `BaseModel` subclass with `model_config = ConfigDict(extra="forbid")`. Inherit this for any contract that crosses a boundary; opt into stricter type coercion with `class Foo(StrictModel, strict=True)` when JSON coercion (UUID, int) is undesirable.
+- **`health.HealthResponse`** — `status: Literal["ok"]` + `version: str`. Returned by `GET /api/v1/health`.
+- **`session.SessionCreate`** / **`session.SessionInfo`** — the create-empty / public-info shapes used by `src.api.sessions.SessionStore`.
+- **`config.Settings`** — `pydantic_settings.BaseSettings` reading the four `LLM_*` env vars (`LLM_PROVIDER`, `LLM_API_KEY`, `LLM_BASE_URL`, `LLM_MODEL`). Provider-pluggable seam — wire OpenAI, Anthropic, Azure, or vLLM by swapping the values without touching this code.
+- **`config.get_settings()`** — fresh `Settings` constructor (deliberately unmemoised so tests can re-construct after `monkeypatch.setenv`). Real callers should hold a single instance at startup.
+
+## Conventions
+
+- One module per logical contract group — `health`, `session`, `config`. Add a new file rather than appending to an existing one.
+- Per-class `strict=True` is the call site for any model that should reject `"3.14"` for a float; HTTP-boundary models often skip it because JSON requires the coercion.
+- `pyproject.toml` `[tool.ruff.lint.per-file-ignores]` adds `"src/models/**" = ["TCH003"]` because Pydantic needs runtime imports for type annotations.
diff --git a/src/observability/README.md b/src/observability/README.md
@@ -0,0 +1,20 @@
+# `src/observability/`
+
+OpenTelemetry SDK setup, OTLP exporter wiring, structured-JSON logging with trace correlation, and span helpers built around the OTel GenAI semantic conventions. Wired once from `src.api.main.lifespan` so the rest of the codebase calls into a configured tracer / logger without orchestrating startup.
+
+## Key interfaces
+
+- **`tracing.setup_tracing()`** — creates the global `TracerProvider`, attaches a `BatchSpanProcessor` with the OTLP gRPC exporter (or `ConsoleSpanExporter` when `OTEL_EXPORTER=console`). Reads `OTEL_SERVICE_NAME` (default `harness-python-react`) and `OTEL_EXPORTER_OTLP_ENDPOINT` (default `http://localhost:4317`).
+- **`tracing.instrument_fastapi(app)`** + **`tracing.instrument_httpx()`** — auto-instrumentation hooks. Each FastAPI request and outbound httpx call gets a span with the standard semantic-convention attributes.
+- **`tracing.get_tracer(name)`** — retrieve a tracer by module name when manual instrumentation is needed.
+- **`logging.setup_logging(level=None)`** — configures stdlib `logging` to emit single-line JSON via `_JSONFormatter`. Reads `LOG_LEVEL` env when no argument is passed. Loops in `LoggingInstrumentor` so the `otelTraceID` / `otelSpanID` fields land on every record.
+- **`logging._JSONFormatter`** — the formatter; tested directly in `tests/test_observability.py` for the trace-correlation contract.
+- **`spans.agent_span(name, attributes=None)`** — context manager that opens a span, sets initial attributes from a `Mapping`, yields the live span for further mutation. Use this rather than calling `tracer.start_as_current_span` directly so the attribute-shape stays consistent.
+- **`spans.set_span_attributes(span, **kwargs)`** — set multiple attributes; `None` values are silently skipped.
+- **`spans.GENAI_*` / `DB_*` constants** — exported attribute-key constants for the OTel GenAI + database semantic conventions. Use these instead of raw strings so a typo is a `NameError`, not a silently-different attribute.
+
+## Conventions
+
+- **Semconv keys only.** The constants at the top of `spans.py` are the single source of attribute names. Adding a custom `agent.foo` attribute is a hard no — extend the semconv list (or wait for upstream OTel to bless one).
+- **Provider-agnostic exporter.** `OTEL_EXPORTER_OTLP_ENDPOINT` is the standard variable; `docker-compose.yml` sets it to `http://jaeger:4317` in the compose network.
+- **Lifespan-only setup.** Don't call `setup_tracing()` from anywhere except `src.api.main.lifespan` — duplicate calls trigger OTel's "Overriding of current TracerProvider" warning, and tests rely on the test-fixture variant that attaches an in-memory exporter to whatever provider exists.
diff --git a/src/tools/README.md b/src/tools/README.md
@@ -0,0 +1,19 @@
+# `src/tools/`
+
+Generic tool registry — the dispatcher pattern an LLM-driven agent uses to call typed tools. Each tool is a function `StrictModel -> StrictModel`; the registry maps a name to `(input_schema, callable)` and validates the dict-shaped input against the schema before dispatch. Layer-wise sits below `agent` / `api` / `eval`, above `data` / `models` (enforced by `import-linter`).
+
+## Key interfaces
+
+- **`registry.Registry`** — instance class. Construct one per agent/test fixture; the module also exposes a global `registry` singleton that the example `echo_tool` self-registers into at import time.
+  - **`register(name, input_schema)`** — decorator factory. Wraps a function and inserts the `(input_schema, fn)` pair under `name`. Raises `ValueError` on duplicate registration.
+  - **`dispatch(name, raw_input)`** — `name`-lookup → `input_schema.model_validate(raw_input)` → call. Pydantic `ValidationError` propagates on bad input (wrong type or unknown keys via `StrictModel.extra="forbid"`).
+  - **`names()`** — sorted list of registered tool names; cheap introspection for the agent's tool-listing prompt.
+- **`registry.UnknownToolError`** — `KeyError` subclass raised by `dispatch` when *name* isn't registered. Caught and rendered as a tool-call-error event by the agent loop.
+- **`registry.registry`** — the module-global `Registry` instance. Real agents use this so the `echo_tool` is reachable from any consumer; tests construct private `Registry()` instances to avoid cross-test contamination.
+- **`registry.echo_tool`** + **`EchoToolInput`** + **`EchoToolOutput`** — example tool demonstrating the layer. Ships pre-registered under name `"echo"` to give a working dispatch path on a fresh clone.
+
+## Conventions
+
+- **Inputs and outputs are `StrictModel`s.** Avoid raw `dict`s — the `extra="forbid"` posture is the entire reason this pattern beats hand-rolled `if/elif` dispatch over JSON.
+- **Tools are pure functions** of their input. State the tool needs (DB connection, HTTP client) is injected via partial / closure at registration time, not via module globals.
+- **Add a tool = add a module.** Real tools beyond the example get their own file under `src/tools/` and self-register the same way `echo_tool` does. Keep `registry.py` itself focused on the dispatcher.
diff --git a/uv.lock b/uv.lock