diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..a5e9cfe --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,82 @@ +# Nous — project conventions + +This file is auto-loaded by Claude Code on every session in this repo. The +rules below are non-negotiable; when they conflict with general AI/coding +defaults, **the rules here win**. + +## 🚫 Tests must NEVER make live LLM calls + +**No unit, integration, or end-to-end test in this repo may make a real +API call to Anthropic, OpenAI, or any other LLM provider. Period.** + +Why this is a hard rule: +- Tests run on every CI build, every contributor's laptop, and every PR + rebase. Live LLM calls would burn tokens for no signal — the test + result depends on what the model said today, not on the code under test. +- Token budget for `nous` is mission-critical. We refuse to spend it on + CI churn. +- Live calls are non-deterministic. A flaky test from a model rephrasing + itself is worse than no test. + +**How to test correctly:** + +| Code under test | How to mock | +|---|---| +| `LLMDispatcher` | Pass `completion_fn=` in the constructor — a callable that returns canned `chat.completions`-shaped objects. See `tests/test_llm_dispatch.py`'s `_make_fake_completion` for the pattern. | +| `CLIDispatcher` (claude -p subprocess) | Patch `orchestrator.cli_dispatch.subprocess.run` — return a `subprocess.CompletedProcess` with the JSON the test wants. See `tests/test_cli_dispatch.py`. | +| `SDKDispatcher` (Claude Agent SDK) | Pass `sdk_runner=` in the constructor — a callable returning `SDKResult`. See `tests/test_sdk_dispatch.py`'s `_ScriptedRunner`. | +| `InlineDispatcher` | Set up the `.nous_response_*` signal file in tmp_path before calling dispatch. | +| Stub-driven flows | Use `StubDispatcher` from `orchestrator.dispatch` — it produces valid schema-conformant artifacts with no LLM at all. | + +**Active enforcement:** `tests/conftest.py` installs an autouse fixture +(`block_live_llm_calls`) that: +1. Strips `OPENAI_API_KEY` and `ANTHROPIC_API_KEY` from the env so any + accidental real-client construction fails loudly instead of silently + billing. +2. Patches `urllib.request.urlopen` to refuse `api.anthropic.com`, + `api.openai.com`, and `api.litellm.ai` hosts. +3. Patches `claude_agent_sdk.query` (when installed) to a hard-fail. + +If a test triggers any of these guards, the fix is to inject a fake at +the dispatcher's seam — never to disable the guard. The guards are the +backstop; the seams are the contract. + +## Behavioral testing only + +When the test mock is in place, write **behavioral** tests: +- ✓ Assert what's on disk after `dispatcher.dispatch(...)`. +- ✓ Assert metrics rows in `llm_metrics.jsonl`. +- ✓ Assert artifacts match a JSON Schema. +- ✗ Don't assert which method was called on the mock. +- ✗ Don't assert argv shape, internal helper invocation, or attribute access. + +The seam is the contract; the implementation is free to evolve. + +## Token-budget discipline (production code) + +Beyond tests, Nous itself must be frugal with tokens: +- **Methodology stays in `CLAUDE.md`** (auto-loaded by Claude Code), not + in per-call prompts. The thin templates in `prompts/methodology/*_thin.md` + carry only per-iteration context. +- **System blocks are cached** (`cache_control: ephemeral`). Any code + that constructs an SDK call with a static system_prompt should rely + on this, and any change that breaks within-iteration cache locality + must be measured (`nous cost --cache-stats`) and justified. +- **Read-only mapping uses Explore subagents**, not Opus. See + `orchestrator/explore_design.py`. + +## PR workflow (project owner: @sriumcp) + +1. Branch off `upstream/reflective` (NOT `main`). +2. Push to `origin` (the fork at `sriumcp/agentic-strategy-evolution`). +3. Open PR with base `upstream/reflective`, head `sriumcp:`. +4. PR body links the issue with `Closes #N` (or `Refs #N` for partials). +5. Stack PRs when one logical change builds on another rather than waiting + for merge — see `docs/plans/CHECKPOINT.md` for the pattern. + +## See also + +- `docs/contributing/workflow.md` — full workflow doc. +- `docs/security.md` — permission policy (#135). +- `docs/architecture.md` — internals. +- `docs/plans/CHECKPOINT.md` — current state of the #120 epic. diff --git a/docs/contributing/workflow.md b/docs/contributing/workflow.md index 4aaa2cf..ecc579a 100644 --- a/docs/contributing/workflow.md +++ b/docs/contributing/workflow.md @@ -4,6 +4,32 @@ This document defines the standard workflow for contributors using Claude Code t --- +## Non-negotiable rules + +These apply to every PR, every test, every contributor. They are also restated in the auto-loaded `CLAUDE.md` files at the repo root and under `tests/`. + +### 🚫 Tests must NEVER make live LLM calls + +**No unit, integration, or end-to-end test in this repo may make a real API call to Anthropic, OpenAI, or any other LLM provider.** Tests must mock LLMs at the dispatcher seam: + +- `LLMDispatcher` → pass `completion_fn=`. +- `CLIDispatcher` → patch `orchestrator.cli_dispatch.subprocess.run`. +- `SDKDispatcher` → pass `sdk_runner=` returning `SDKResult`. +- `InlineDispatcher` → pre-populate the `.nous_response_*` signal file. +- Or use `StubDispatcher` for end-to-end orchestrator flows. + +`tests/conftest.py` installs an autouse `block_live_llm_calls` fixture that strips LLM API keys from the env and patches `urllib.request.urlopen` + `claude_agent_sdk.query` to hard-fail on real network calls. If a test trips the guard, fix the test by injecting a fake — never disable the guard. + +### Behavioral testing only + +Assert what's on disk, what's in metrics rows, what schemas validate. Don't assert which methods were called or what argv was constructed. The dispatcher seams are the contract. + +### Token-budget discipline + +`nous` runs against real LLMs in production; CI cannot. Every PR that touches `orchestrator/` must keep the cache-friendly invariant: methodology lives in `CLAUDE.md` (auto-loaded), system blocks are stable across calls (cache hits), per-iteration content goes in the user message (cache busts when it should). `nous cost --cache-stats` is the regression gate. + +--- + ## Overview Any contributor with Claude Code should follow this workflow when working on an issue. It combines AI-assisted planning and review with explicit human approval gates to produce consistent, high-quality contributions. diff --git a/tests/CLAUDE.md b/tests/CLAUDE.md new file mode 100644 index 0000000..0eff073 --- /dev/null +++ b/tests/CLAUDE.md @@ -0,0 +1,43 @@ +# Tests — local conventions + +This file is auto-loaded whenever Claude Code is operating inside `tests/`. +It restates the non-negotiable rules from the root `CLAUDE.md` so they're +in scope even when the repo root isn't. + +## 🚫 NEVER make live LLM calls in tests + +This applies to **unit, integration, and end-to-end tests alike**. There +is no test category in this repo that's allowed to spend tokens against +a real provider. + +**Active enforcement** (see `tests/conftest.py`): +- `block_live_llm_calls` autouse fixture strips `OPENAI_API_KEY` / + `ANTHROPIC_API_KEY` and patches `urllib.request.urlopen` + `claude_agent_sdk.query` + to hard-fail on real network calls. If a new test trips this guard, + inject a fake at the dispatcher seam — don't disable the guard. + +**Standard injection seams**: +- `LLMDispatcher(..., completion_fn=fake)` — see `_make_fake_completion`. +- `CLIDispatcher` — `monkeypatch.setattr("orchestrator.cli_dispatch.subprocess.run", fake)`. +- `SDKDispatcher(..., sdk_runner=fake)` — see `_ScriptedRunner`. +- `StubDispatcher` for end-to-end orchestrator flows that don't care + about any specific LLM behavior. + +## Behavioral testing only + +- ✓ Assert what's on disk: file existence, JSON Schema validation, contents. +- ✓ Assert metrics-row contents in `llm_metrics.jsonl`. +- ✓ Assert exit codes and stderr substrings for hooks. +- ✗ Don't assert "function X was called with Y" — that's structural. +- ✗ Don't assert argv shape or internal control flow. + +The dispatcher seams (Protocol + dataclass result) are the contract; +the implementation is free to evolve under them. + +## Determinism + +- Inject `now=`, `monkeypatch.time.sleep`, `os.utime` for time-dependent + behavior. Tests must not depend on real wall-clock. +- Inject `pid_check=` for `gc_orphan_worktrees` — never assert on real PIDs. +- Use `_RecordingPoster` / `_ScriptedRunner` patterns to capture arguments + for assertion without coupling to internal call shapes. diff --git a/tests/conftest.py b/tests/conftest.py index 9e9709f..476b4ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import json +import urllib.request from pathlib import Path import pytest @@ -9,6 +10,67 @@ TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "orchestrator" / "templates" +# ─── No-live-LLM enforcement (project principle, see CLAUDE.md) ──────────── + + +_BLOCKED_HOSTS = ( + "api.anthropic.com", + "api.openai.com", + "api.litellm.ai", +) + + +class LiveLLMCallBlocked(RuntimeError): + """A test triggered something that would call a real LLM provider. + + The fix is to inject a fake at the dispatcher seam (sdk_runner=, + completion_fn=, monkeypatch subprocess.run, etc.) — NEVER to + disable this guard. See CLAUDE.md. + """ + + +@pytest.fixture(autouse=True) +def block_live_llm_calls(monkeypatch): + """Auto-applied to every test: strip LLM API keys from env and refuse + real network calls to known LLM hosts. + + Tests that legitimately need to construct an OpenAI client should pass + api_key= explicitly (existing tests already do this). Tests that need + to dispatch an agent should inject a fake — see tests/CLAUDE.md. + """ + for var in ("OPENAI_API_KEY", "OPENAI_BASE_URL", "ANTHROPIC_API_KEY"): + monkeypatch.delenv(var, raising=False) + + original_urlopen = urllib.request.urlopen + + def _guarded_urlopen(req, *args, **kwargs): + url = req.full_url if hasattr(req, "full_url") else str(req) + if any(host in url for host in _BLOCKED_HOSTS): + raise LiveLLMCallBlocked( + f"Test attempted urlopen to {url!r} — live LLM calls are " + "forbidden. Inject a fake at the dispatcher seam. See CLAUDE.md." + ) + return original_urlopen(req, *args, **kwargs) + + monkeypatch.setattr(urllib.request, "urlopen", _guarded_urlopen) + + # Patch claude_agent_sdk.query if installed; this catches accidental + # uses of the default sdk_runner path. + try: + import claude_agent_sdk # type: ignore[import-not-found] + + async def _blocked_query(*args, **kwargs): + raise LiveLLMCallBlocked( + "Test invoked claude_agent_sdk.query — pass sdk_runner= " + "to SDKDispatcher with a fake. See CLAUDE.md." + ) + yield # pragma: no cover (makes the function an async generator) + + monkeypatch.setattr(claude_agent_sdk, "query", _blocked_query) + except ImportError: + pass + + @pytest.fixture def schemas_dir(): return SCHEMAS_DIR diff --git a/tests/test_no_live_llm_guard.py b/tests/test_no_live_llm_guard.py new file mode 100644 index 0000000..7e079a5 --- /dev/null +++ b/tests/test_no_live_llm_guard.py @@ -0,0 +1,70 @@ +"""Meta-tests: verify the conftest's no-live-LLM guard actually fires. + +If these tests stop passing, the guard is broken — and a real test could +silently make a live API call. CI should fail loudly. +""" +from __future__ import annotations + +import os +import urllib.error +import urllib.request + +import pytest + +from tests.conftest import LiveLLMCallBlocked + + +class TestEnvKeysStripped: + """The guard removes LLM API key env vars so any code that reads them + sees ``None`` and falls back to the disabled-mode path.""" + + def test_openai_api_key_unset(self): + assert os.environ.get("OPENAI_API_KEY") is None + + def test_anthropic_api_key_unset(self): + assert os.environ.get("ANTHROPIC_API_KEY") is None + + +class TestUrlopenGuard: + """Direct urllib.request.urlopen calls to LLM hosts must raise.""" + + @pytest.mark.parametrize("host", [ + "https://api.anthropic.com/v1/messages", + "https://api.openai.com/v1/chat/completions", + ]) + def test_blocked_host_raises(self, host): + with pytest.raises(LiveLLMCallBlocked): + urllib.request.urlopen(host) + + def test_non_blocked_host_passes_through_signature(self): + """The guard is a substring check on known LLM hosts; calls to + other URLs are NOT blocked by this fixture (so tests that legitimately + post to e.g. a Slack webhook still go through their own injection).""" + # We don't actually call out to the network — just assert the guard + # has correct shape for a non-blocked URL. + # (The guard delegates to the original urlopen for non-blocked URLs.) + try: + urllib.request.urlopen("http://localhost:1/", timeout=0.01) + except LiveLLMCallBlocked: + pytest.fail("guard wrongly blocked a non-LLM host") + except (urllib.error.URLError, OSError, TimeoutError): + pass # expected — connection refused / no listener + + +class TestSDKQueryGuard: + """When claude_agent_sdk is installed, the guard replaces query() with + a hard-fail. SDKDispatcher tests inject a fake sdk_runner instead.""" + + def test_sdk_query_blocked_when_installed(self): + try: + import claude_agent_sdk # type: ignore[import-not-found] + except ImportError: + pytest.skip("claude-agent-sdk not installed; nothing to guard") + + async def _drive(): + async for _ in claude_agent_sdk.query(prompt="x", options=None): + pass + + import anyio + with pytest.raises(LiveLLMCallBlocked): + anyio.run(_drive)