From 3632db837e3c0120a67b7892be8df170092b1f9c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 8 Jun 2026 05:16:51 +0000 Subject: [PATCH 1/3] Trinity: automated CLI trial harness (Gemini/Codex headless runs) Replaces manual VS Code chat-panel coordination with a headless, observable, CLI-driven trial runner for the coordination layer. - bin/trial: list | doctor | validate | preflight | run - driver abstraction (gemini --yolo, codex exec --full-auto, claude -p) + a deterministic mock driver so the battery runs with no API keys - deterministic PROJECT-SPEC parser (ingestion stage 1) with validation - preflight question round + human gate before any work - isolated per-run workspace (own .tick/ + throwaway git repo) - observability: run.jsonl spine, per-agent transcripts, report/SUMMARY.md - battery: 2 build + 2 debug scenarios; test/smoke.sh green 11/11 https://claude.ai/code/session_01WnzAdCRGrrhukvW1etFLyB --- CHANGELOG.md | 8 + PROJECT/2-WORKING/P1-TRINITY-ROUND2.md | 21 ++ .../coordination-layer/harness/.gitignore | 3 + .../coordination-layer/harness/README.md | 139 +++++++++++ .../coordination-layer/harness/bin/trial | 229 ++++++++++++++++++ .../harness/prompts/agent-loop.md | 60 +++++ .../harness/prompts/preflight.md | 36 +++ .../coordination-layer/harness/src/drivers.js | 102 ++++++++ .../harness/src/mock-agent.js | 191 +++++++++++++++ .../coordination-layer/harness/src/observe.js | 57 +++++ .../harness/src/preflight.js | 97 ++++++++ .../coordination-layer/harness/src/proc.js | 49 ++++ .../coordination-layer/harness/src/prompts.js | 74 ++++++ .../coordination-layer/harness/src/run.js | 204 ++++++++++++++++ .../coordination-layer/harness/src/spec.js | 201 +++++++++++++++ .../coordination-layer/harness/test/smoke.sh | 52 ++++ .../harness/trials/build-todo-api.project.md | 76 ++++++ .../trials/build-url-shortener.project.md | 51 ++++ .../harness/trials/debug-calc-bugs.project.md | 39 +++ .../trials/debug-poisoned-task.project.md | 38 +++ 
.../fixtures/calc-bugs/.solutions/add.js | 3 + .../fixtures/calc-bugs/.solutions/mul.js | 3 + .../trials/fixtures/calc-bugs/package.json | 9 + .../trials/fixtures/calc-bugs/src/add.js | 4 + .../trials/fixtures/calc-bugs/src/mul.js | 4 + .../fixtures/calc-bugs/test/add.test.js | 9 + .../fixtures/calc-bugs/test/mul.test.js | 9 + .../fixtures/poison/.solutions/parse.js | 6 + .../trials/fixtures/poison/package.json | 7 + .../trials/fixtures/poison/src/parse.js | 6 + .../trials/fixtures/poison/test/parse.test.js | 9 + 31 files changed, 1796 insertions(+) create mode 100644 experiments/coordination-layer/harness/.gitignore create mode 100644 experiments/coordination-layer/harness/README.md create mode 100755 experiments/coordination-layer/harness/bin/trial create mode 100644 experiments/coordination-layer/harness/prompts/agent-loop.md create mode 100644 experiments/coordination-layer/harness/prompts/preflight.md create mode 100644 experiments/coordination-layer/harness/src/drivers.js create mode 100755 experiments/coordination-layer/harness/src/mock-agent.js create mode 100644 experiments/coordination-layer/harness/src/observe.js create mode 100644 experiments/coordination-layer/harness/src/preflight.js create mode 100644 experiments/coordination-layer/harness/src/proc.js create mode 100644 experiments/coordination-layer/harness/src/prompts.js create mode 100644 experiments/coordination-layer/harness/src/run.js create mode 100644 experiments/coordination-layer/harness/src/spec.js create mode 100755 experiments/coordination-layer/harness/test/smoke.sh create mode 100644 experiments/coordination-layer/harness/trials/build-todo-api.project.md create mode 100644 experiments/coordination-layer/harness/trials/build-url-shortener.project.md create mode 100644 experiments/coordination-layer/harness/trials/debug-calc-bugs.project.md create mode 100644 experiments/coordination-layer/harness/trials/debug-poisoned-task.project.md create mode 100644 
experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/add.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/mul.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/calc-bugs/package.json create mode 100644 experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/add.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/mul.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/add.test.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/mul.test.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/poison/.solutions/parse.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/poison/package.json create mode 100644 experiments/coordination-layer/harness/trials/fixtures/poison/src/parse.js create mode 100644 experiments/coordination-layer/harness/trials/fixtures/poison/test/parse.test.js diff --git a/CHANGELOG.md b/CHANGELOG.md index d05bc49..7e7132e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Do not edit a version block that has already been committed and pushed --> +## [2.2.0] - 2026-06-08 + +### Added +- **Trinity trial harness** (`experiments/coordination-layer/harness/`) — drives automated, headless multi-agent coordination trials from the system CLI, replacing the manual "paste prompts into VS Code chat panels and babysit" workflow used in Trinity Runs 1–3. `trial run <spec>` parses a structured project spec, runs a preflight question round with a human gate, seeds an isolated `.tick/` backlog, spawns each agent CLI (Gemini/Codex/Claude) concurrently and headlessly, then scores the run with `tick analyze`.
Full observability per run: structured `run.jsonl` spine, per-agent transcripts, and a `report/SUMMARY.md`. +- **Driver abstraction + mock driver** (`harness/src/drivers.js`, `mock-agent.js`) — headless invocation specs for `gemini --yolo`, `codex exec --full-auto`, and `claude -p`, each env-overridable; a deterministic mock driver speaks the full `tick take → done|break` protocol so the entire battery runs (and the harness is validated) with no API keys. +- **Deterministic spec parser** (`harness/src/spec.js`) — implements stage 1 of the previously-scaffolded ingestion pipeline: parses `PROJECT-SPEC` markdown into validated task events, hard-failing on duplicate IDs, empty scopes, non-numeric priority, and dependency cycles. +- **Trial battery** (`harness/trials/`) — two build scenarios (Todo API, URL shortener) and two debug scenarios (seeded-bug fix-iterate, poisoned-task circuit-breaker) with fixtures, plus a `test/smoke.sh` that runs all four on the mock driver (11/11 green). + ## [2.1.6] - 2026-04-23 ### Changed diff --git a/PROJECT/2-WORKING/P1-TRINITY-ROUND2.md b/PROJECT/2-WORKING/P1-TRINITY-ROUND2.md index 80fb0a3..c5b37b3 100644 --- a/PROJECT/2-WORKING/P1-TRINITY-ROUND2.md +++ b/PROJECT/2-WORKING/P1-TRINITY-ROUND2.md @@ -150,6 +150,27 @@ and remove the `tick claim` instruction. 3. Coordinator integrates the two halves and boots the app. 4. Append a Run 3 section to `RECAP.md` and update this doc's status. +### Run 3 automation — CLI trial harness (2026-06-08) + +Run 3 no longer requires manual chat-panel coordination. 
The +[`experiments/coordination-layer/harness/`](../../experiments/coordination-layer/harness/README.md) +harness drives the whole trial from the system CLI: it parses a structured spec, +runs a preflight question round with a human gate, seeds the backlog, spawns +**Gemini CLI and Codex CLI headlessly and concurrently** (`gemini --yolo`, +`codex exec --full-auto`), captures full transcripts, and runs `tick analyze` — +producing the concurrent-claim-time metric automatically. To run Run 3: + +```bash +cd experiments/coordination-layer/harness +node bin/trial doctor # confirm gemini + codex are installed with keys +node bin/trial run build-todo-api # the Run 2/3 6-task split, now automated +``` + +The mock driver (`--agents gemini:mock,codex:mock`) validates the harness with no +keys; `bash test/smoke.sh` runs the full build+debug battery. The +load-bearing open question is unchanged — only a **real-CLI** run answers whether +Gemini/Codex actually comply with the integration prompt. + --- ## Open questions for Run 3+ diff --git a/experiments/coordination-layer/harness/.gitignore b/experiments/coordination-layer/harness/.gitignore new file mode 100644 index 0000000..6e209a8 --- /dev/null +++ b/experiments/coordination-layer/harness/.gitignore @@ -0,0 +1,3 @@ +# Per-run trial artifacts (workspaces, transcripts, reports). Regenerated on +# every `trial run`; never commit them. +runs/ diff --git a/experiments/coordination-layer/harness/README.md b/experiments/coordination-layer/harness/README.md new file mode 100644 index 0000000..6223769 --- /dev/null +++ b/experiments/coordination-layer/harness/README.md @@ -0,0 +1,139 @@ +# Trinity trial harness — automated multi-agent runs from the CLI + +**Question this answers:** *Can the Trinity coordination layer use Gemini CLI and +Codex CLI to run automated trials, instead of a human manually coordinating agent +chat sessions in VS Code?* + +**Answer: yes.** The coordination layer (`../bin/tick`) was already fully +headless. 
The only manual part of Runs 1–3 was *driving the agents* — pasting +prompts into VS Code chat panels and babysitting. This harness automates exactly +that: it seeds a backlog from a structured spec, runs a preflight question round, +spawns each agent CLI **headlessly and concurrently**, captures everything, and +scores the run with `tick analyze`. No human in the loop during the run. + +``` +spec (.md) ─▶ parse ─▶ preflight Q&A ─▶ [human gate] ─▶ seed .tick/ ─▶ spawn agents ─▶ analyze ─▶ report + stage 1 agents ask review & backlog gemini/codex/… metrics SUMMARY.md + (deterministic) clarifying answer (isolated) in parallel + questions +``` + +## Quick start + +```bash +cd experiments/coordination-layer/harness + +node bin/trial list # the battery of trial specs +node bin/trial doctor # which agent CLIs are installed +node bin/trial validate debug-calc-bugs # parse + validate a spec + +# Real run (needs gemini + codex installed with API keys): +node bin/trial run build-todo-api + +# Harness self-test with the deterministic mock driver (no keys needed): +node bin/trial run build-todo-api --agents gemini:mock,codex:mock --auto +bash test/smoke.sh # runs the whole battery on the mock +``` + +## How a run works + +1. **Parse + validate** the spec (`src/spec.js`) — this is the deterministic + "stage 1" the [ingestion scaffold](../ingestion/README.md) described: + duplicate IDs, empty scopes, non-numeric priority, dependency cycles all + hard-fail before anything spawns. +2. **Isolated workspace** — each run gets `runs/<spec>-<timestamp>/workspace/`, its own + throwaway git repo with its own `.tick/` state (`TICK_REPO_ROOT` points at + it). The real repo's `.tick/` is never touched, so trials are safe to run + repeatedly and in parallel. +3. **Preflight + human gate** — every agent gets a read-only prompt and may emit + `QUESTION:` lines (or `NO_QUESTIONS`). Questions are collected to + `preflight/QUESTIONS.md` and the run **pauses** for a human to review/answer + by editing the spec.
`--auto` skips the gate; `--skip-preflight` skips the + round entirely. +4. **Seed** the backlog — one `tick log task.created` per task. +5. **Spawn agents concurrently** — each driver runs its CLI headlessly with the + integration prompt (`prompts/agent-loop.md`), looping + `tick take → work → tick done|break`. +6. **Verify + analyze** — runs the project `Verify` command, then `tick analyze`, + and writes `report/SUMMARY.md`. + +## Observability + +Everything a run does is recorded so it's fully reconstructable: + +| Artifact | What's in it | +|---|---| +| `run.jsonl` | structured spine — one JSON line per harness event (`seed.task`, `agent.spawn`, `agent.exit`, `verify.result`, `trial.end`, …) | +| `logs/<agent>.log` | raw stdout+stderr transcript of each agent process | +| `prompt-<agent>.md` | the exact prompt each agent received | +| `preflight/QUESTIONS.md` | clarifying questions per agent | +| `report/analyze.json` / `.md` | full `tick analyze` output (claims, dones, breaks, concurrent-claim time) | +| `report/SUMMARY.md` | human-readable verdict + per-agent table | +| `spec.json` | the parsed, resolved spec | + +## The battery + +| Spec | Kind | Tasks | Exercises | +|---|---|---|---| +| `build-todo-api` | build | 6 | two-half path routing (HTTP / store), claim cap | +| `build-url-shortener` | build | 4 | three-concern routing (codec / store / http), contract deps | +| `debug-calc-bugs` | debug | 2 | fix-iterate: seeded failing tests → green | +| `debug-poisoned-task` | debug | 2 | circuit-breaker: one fixable bug + one unsatisfiable task | + +Debug specs carry a **Fixture** (seeded-bug code under `trials/fixtures/`) and a +per-task **Verify** command. The fixtures include a `.solutions/` dir and a +`Mock-solution:` mapping so the mock driver can drive red→green deterministically +— **real drivers ignore that field and actually debug.** + +## Drivers + +Headless invocation per agent (`src/drivers.js`).
Flags are overridable with env +vars so you can tune them without editing code (CLIs move fast): + +| Driver | Run invocation | Override env | +|---|---|---| +| `gemini` | `gemini --yolo` (prompt on stdin) | `GEMINI_CMD`, `GEMINI_ARGS` | +| `codex` | `codex exec --full-auto -` | `CODEX_CMD`, `CODEX_ARGS` | +| `claude` | `claude -p --permission-mode acceptEdits` | `CLAUDE_CMD`, `CLAUDE_ARGS` | +| `mock` | deterministic stand-in (spawns `src/mock-agent.js` under node) | — | + +`--agents id:driver,id:driver` overrides the roster, e.g. +`--agents gemini:gemini,codex:codex` for a real run, or +`--agents a:mock,b:mock` to validate the harness anywhere. + +## Running for real (gemini + codex) + +1. Install both CLIs and set their API keys: + - Gemini CLI — `GEMINI_API_KEY` (or its configured auth). + - Codex CLI — `OPENAI_API_KEY` (or its configured auth). +2. `node bin/trial doctor` — confirm both show ✅. +3. `node bin/trial run build-todo-api` — preflight pauses at the gate; review + `preflight/QUESTIONS.md`, edit the spec if needed, then re-run with + `--skip-preflight` (or `--auto`). +4. Read `report/SUMMARY.md`. The load-bearing metric is **concurrent-claim time** + (target ≥ 50%, per the Run 3 success criterion in + [`P1-TRINITY-ROUND2.md`](../../../PROJECT/2-WORKING/P1-TRINITY-ROUND2.md)). + +## Adding a trial + +Copy any `trials/*.project.md`, following +[`../ingestion/PROJECT-SPEC.template.md`](../ingestion/PROJECT-SPEC.template.md). +Project metadata (`**Key:** value`) goes above the first `##` heading; one +`### TASK-<ID>` block per task. For a debug trial, add a `Fixture:` dir and a +per-task `Verify:` command (plus `Mock-solution:` if you want the mock to drive +it green). `node bin/trial validate <spec>` checks it before you run. + +## Limitations / honest notes + +- **The mock is not a real agent.** It proves the *harness* — protocol calls, + concurrency, observability, red→green, circuit-break — not that Gemini/Codex + will comply with the prompt.
That's still the load-bearing open question from + the Run 1/2 retros; only a real-CLI run answers it. +- **Mock concurrent-claim time runs ~30–40%** because simulated work is fast and + tasks are few. Real agents (minutes per task) should overlap far more; the + metric is computed identically either way. +- **The claim lock is shared across agents** — simultaneous `tick take` calls + collide and the loser must retry. The mock retries with backoff; the real + agent prompt instructs the same. If real runs show this as friction, the + Phase-2 fix is a per-agent lock or a queue. +- Single branch / single host only, same as the underlying `tick` protocol. diff --git a/experiments/coordination-layer/harness/bin/trial b/experiments/coordination-layer/harness/bin/trial new file mode 100755 index 0000000..05cd7a2 --- /dev/null +++ b/experiments/coordination-layer/harness/bin/trial @@ -0,0 +1,229 @@ +#!/usr/bin/env node +'use strict'; + +// trial — drive a multi-agent coordination trial from the system CLI. 
+// +// Replaces the manual "paste prompts into VS Code chat panels and babysit" +// workflow with a repeatable, observable, scriptable run: +// +// trial list list the battery of trial specs +// trial doctor show which agent CLIs are installed +// trial validate parse + validate a spec (ingestion stage 1) +// trial preflight [opts] let agents ask clarifying questions, then gate +// trial run [opts] preflight → human gate → run → analyze → report +// +// Options: +// --agents id:driver,id:driver override participants (driver ∈ gemini|codex|claude|mock) +// --skip-preflight skip the question round +// --auto don't stop at the human gate even if there are questions +// --timeout per-agent wall-clock cap (default 600000) +// --max-attempts fix-iterate attempts before an agent circuit-breaks (default 3) + +const fs = require('fs'); +const path = require('path'); +const { parseSpecFile } = require('../src/spec'); +const { DRIVERS, getDriver } = require('../src/drivers'); +const { Recorder } = require('../src/observe'); +const { runPreflight } = require('../src/preflight'); +const { runTrial, prepareWorkspace } = require('../src/run'); + +const HARNESS_ROOT = path.join(__dirname, '..'); +const TRIALS_DIR = path.join(HARNESS_ROOT, 'trials'); +const RUNS_DIR = path.join(HARNESS_ROOT, 'runs'); + +function parseArgs(argv) { + const positional = []; + const flags = {}; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a.startsWith('--')) { + const key = a.slice(2); + const nx = argv[i + 1]; + if (nx === undefined || nx.startsWith('--')) flags[key] = true; + else { flags[key] = nx; i++; } + } else positional.push(a); + } + return { positional, flags }; +} + +function resolveSpecPath(arg) { + if (fs.existsSync(arg)) return path.resolve(arg); + const inTrials = path.join(TRIALS_DIR, arg.endsWith('.md') ? 
arg : `${arg}.project.md`); + if (fs.existsSync(inTrials)) return inTrials; + throw new Error(`spec not found: ${arg} (looked in cwd and ${path.relative(process.cwd(), TRIALS_DIR)}/)`); +} + +// Resolve the agent roster. Default: the spec's `Agents:` list, each mapped to +// a driver of the same name. `--agents` overrides with explicit id:driver pairs. +function resolveAgents(spec, flags) { + if (typeof flags.agents === 'string') { + return flags.agents.split(',').map(s => { + const [id, driver] = s.split(':'); + return { id: id.trim(), driver: (driver || id).trim() }; + }); + } + const names = spec.project.agents.length ? spec.project.agents : ['gemini', 'codex']; + return names.map(n => ({ id: n, driver: n })); +} + +function checkDriversAvailable(agents) { + const missing = []; + for (const a of agents) { + const d = getDriver(a.driver); + if (!d.available()) missing.push(`${a.id} (driver: ${a.driver})`); + } + return missing; +} + +function newRunDir(specPath) { + const base = path.basename(specPath).replace(/\.project\.md$|\.md$/, ''); + const ts = new Date().toISOString().replace(/[:.]/g, '-'); + return path.join(RUNS_DIR, `${base}-${ts}`); +} + +function cmdList() { + if (!fs.existsSync(TRIALS_DIR)) { console.log('(no trials dir)'); return 0; } + const specs = fs.readdirSync(TRIALS_DIR).filter(f => f.endsWith('.project.md')); + console.log('Available trial specs:\n'); + for (const s of specs) { + try { + const { project, tasks } = parseSpecFile(path.join(TRIALS_DIR, s)); + console.log(` ${s.replace('.project.md', '')}`); + console.log(` ${project.kind} · ${tasks.length} tasks · agents: ${project.agents.join(', ') || 'gemini, codex'}`); + console.log(` ${project.goal.slice(0, 90)}`); + } catch (e) { + console.log(` ${s} — INVALID: ${e.message.split('\n')[0]}`); + } + console.log(''); + } + return 0; +} + +function cmdDoctor() { + console.log('Agent CLI availability:\n'); + for (const [name, d] of Object.entries(DRIVERS)) { + const ok = d.available(); + 
console.log(` ${ok ? '✅' : '❌'} ${name}${name === 'mock' ? ' (always available)' : ''}`); + } + console.log('\nIf a real CLI is ❌: install it and ensure its API key is set, or run with'); + console.log('`--agents :mock,...` to exercise the harness with the deterministic mock.'); + return 0; +} + +function cmdValidate(specArg) { + const specPath = resolveSpecPath(specArg); + const { project, tasks } = parseSpecFile(specPath); + console.log(`✅ valid: ${project.name} (${project.kind})`); + console.log(` agents: ${project.agents.join(', ') || '(default gemini, codex)'}`); + console.log(` max claims/agent: ${project.max_active_claims_per_agent}`); + if (project.fixture) console.log(` fixture: ${project.fixture}`); + if (project.verify) console.log(` verify: ${project.verify}`); + console.log(` tasks (${tasks.length}):`); + for (const t of tasks) { + console.log(` - ${t.id} [p${t.priority}] ${t.paths.join(', ')}${t.depends_on.length ? ` (deps: ${t.depends_on.join(',')})` : ''}`); + } + return 0; +} + +async function cmdPreflight(specArg, flags) { + const specPath = resolveSpecPath(specArg); + const spec = parseSpecFile(specPath); + const specDir = path.dirname(specPath); + const agents = resolveAgents(spec, flags); + + const missing = checkDriversAvailable(agents); + if (missing.length) { + console.error(`✗ missing agent CLIs: ${missing.join('; ')}`); + console.error(' install them, or pass --agents :mock to use the mock driver.'); + return 1; + } + + const runDir = newRunDir(specPath); + const rec = new Recorder(runDir); + const resolvedSpecPath = path.join(runDir, 'spec.json'); + fs.writeFileSync(resolvedSpecPath, JSON.stringify(spec, null, 2)); + + const workspace = prepareWorkspace({ rec, spec, specDir }); + const result = await runPreflight({ + rec, spec, agents, workdir: workspace, tickBin: require('../src/run').TICK_BIN, + specPath: resolvedSpecPath, timeoutMs: Number(flags.timeout) || 300000, + }); + + console.log(`\nPreflight complete: ${result.totalQuestions} 
question(s) across ${agents.length} agent(s).`); + console.log(`Questions: ${path.relative(process.cwd(), result.questionsPath)}`); + console.log(`Run dir: ${path.relative(process.cwd(), runDir)}`); + return 0; +} + +async function cmdRun(specArg, flags) { + const specPath = resolveSpecPath(specArg); + const spec = parseSpecFile(specPath); + const specDir = path.dirname(specPath); + const agents = resolveAgents(spec, flags); + + const missing = checkDriversAvailable(agents); + if (missing.length) { + console.error(`✗ missing agent CLIs: ${missing.join('; ')}`); + console.error(' install them, or pass --agents :mock to use the mock driver.'); + return 1; + } + + const runDir = newRunDir(specPath); + const rec = new Recorder(runDir); + const resolvedSpecPath = path.join(runDir, 'spec.json'); + fs.writeFileSync(resolvedSpecPath, JSON.stringify(spec, null, 2)); + + console.log(`\n▶ trial: ${spec.project.name} [${agents.map(a => `${a.id}:${a.driver}`).join(', ')}]`); + console.log(` run dir: ${path.relative(process.cwd(), runDir)}\n`); + + const workspace = prepareWorkspace({ rec, spec, specDir }); + const TICK_BIN = require('../src/run').TICK_BIN; + + // --- Preflight + human gate --- + if (!flags['skip-preflight']) { + const pf = await runPreflight({ + rec, spec, agents, workdir: workspace, tickBin: TICK_BIN, + specPath: resolvedSpecPath, timeoutMs: Number(flags.timeout) || 300000, + }); + if (pf.totalQuestions > 0 && !flags.auto) { + console.log(`\n⏸ Preflight raised ${pf.totalQuestions} question(s). 
Run paused at the human gate.`); + console.log(` Review: ${path.relative(process.cwd(), pf.questionsPath)}`); + console.log(' Edit the spec to answer anything material, then re-run with'); + console.log(' --skip-preflight (or --auto to proceed past the gate immediately).'); + return 0; + } + if (pf.totalQuestions > 0) { + console.log(`\n(--auto) proceeding past ${pf.totalQuestions} preflight question(s).`); + } + } + + // --- Run --- + const result = await runTrial({ + rec, spec, specPath: resolvedSpecPath, agents, workspace, + timeoutMs: Number(flags.timeout) || 600000, + maxAttempts: Number(flags['max-attempts']) || 3, + }); + + console.log('\n' + result.summary); + console.log(`Full report: ${path.relative(process.cwd(), path.join(runDir, 'report'))}/`); + return result.verify && !result.verify.ok ? 1 : 0; +} + +async function main(argv) { + const { positional, flags } = parseArgs(argv); + const verb = positional[0]; + switch (verb) { + case 'list': return cmdList(); + case 'doctor': return cmdDoctor(); + case 'validate': return cmdValidate(positional[1]); + case 'preflight': return await cmdPreflight(positional[1], flags); + case 'run': return await cmdRun(positional[1], flags); + default: + console.log(fs.readFileSync(__filename, 'utf8').split('\n').slice(2, 24).map(l => l.replace(/^\/\/ ?/, '')).join('\n')); + return verb ? 2 : 0; + } +} + +main(process.argv.slice(2)) + .then(code => process.exit(code || 0)) + .catch(err => { console.error(`trial: error: ${err.message}`); process.exit(1); }); diff --git a/experiments/coordination-layer/harness/prompts/agent-loop.md b/experiments/coordination-layer/harness/prompts/agent-loop.md new file mode 100644 index 0000000..69e0e45 --- /dev/null +++ b/experiments/coordination-layer/harness/prompts/agent-loop.md @@ -0,0 +1,60 @@ +You are **{{AGENT}}**, one of several AI coding agents working the same codebase +**concurrently** under a coordination protocol. Other agents are working right +now. 
Your job is to complete tasks from a shared backlog **without colliding** +with them. Read this whole message before doing anything. + +## Project + +**{{PROJECT_NAME}}** ({{PROJECT_KIND}} trial) +{{PROJECT_GOAL}} + +Your working directory is `{{WORKDIR}}`. A coordination CLI is available as +`{{TICK}}` (run it from the working directory). It is the ONLY way you and the +other agents stay out of each other's way — use it exactly as described. + +## The backlog + +{{TASK_TABLE}} + +### Task details + +{{TASK_DETAILS}} + +## The protocol — follow this loop exactly + +Repeat until there are no tasks left for you: + +1. **Claim atomically:** run `{{TICK}} take --agent {{AGENT}}`. + - Output `won: TASK-XXX ...` → that task is yours. Continue. + - Output `(no available task)` → STOP. You are done. Exit. + - Output `(claim limit reached ...)` → finish a task you already hold first. + - Error `lock held` / `claim is in progress` → another agent is claiming at the + same instant. Wait ~1s and retry `take`; do not give up. + You may hold at most **{{MAX_CLAIMS}}** active claims at once. + +2. **Confirm scope:** run `{{TICK}} info TASK-XXX` to see the exact declared + paths for the task. **Only edit files inside those paths.** If you discover + you need files outside them, run + `{{TICK}} scope TASK-XXX --agent {{AGENT}} --paths "<path1>,<path2>"` BEFORE + touching them — this tells the other agents to stay clear. + +3. **Do the work.** Implement the task to meet its Acceptance criteria. If the + task has a Verify command, run it and make it pass. + +4. **Finish or break:** + - Success → `{{TICK}} done TASK-XXX --agent {{AGENT}} --note "<what you did>"`. + - Genuinely stuck after **{{MAX_ATTEMPTS}}** real attempts → + `{{TICK}} break TASK-XXX --agent {{AGENT}} --reason "<why you are stuck>"` and move on. + Do NOT burn unlimited attempts on a poisoned task. + +5. Go back to step 1. + +## Rules + +- **Never edit a file outside your current claim's declared paths.** That is the + one rule that prevents collisions.
If in doubt, `{{TICK}} scope` first. +- **Never claim by hand-editing `.tick/` files.** Only use the CLI verbs above. +- Use `--agent {{AGENT}}` on every command. That is your identity; do not use + another agent's name. +- Run the project verify when you think you're done: `{{PROJECT_VERIFY}}` +- Work efficiently and exit cleanly when `take` reports no available task. diff --git a/experiments/coordination-layer/harness/prompts/preflight.md b/experiments/coordination-layer/harness/prompts/preflight.md new file mode 100644 index 0000000..f402f18 --- /dev/null +++ b/experiments/coordination-layer/harness/prompts/preflight.md @@ -0,0 +1,36 @@ +You are **{{AGENT}}**, about to join a concurrent multi-agent coding trial. This +is the **preflight** step: before any code is written, you get one chance to ask +clarifying questions about the task definition. **Do not write or edit any files +now.** Just read and ask. + +## Project + +**{{PROJECT_NAME}}** ({{PROJECT_KIND}} trial) +{{PROJECT_GOAL}} + +Working directory (read-only for now): `{{WORKDIR}}` + +## The backlog you'll be working from + +{{TASK_TABLE}} + +### Task details + +{{TASK_DETAILS}} + +## What to do + +Review the project and the tasks. Consider: +- Are any task scopes ambiguous or overlapping in a way that would cause two + agents to collide? +- Is any acceptance criterion untestable or unclear? +- Are there shared files (config, schema, fixtures) that no task clearly owns? +- Is anything underspecified for you to start cleanly? + +**Respond with EITHER:** +- One line per question, each starting with `QUESTION:` — e.g. + `QUESTION: Does TASK-A2 own the shared router file, or is that TASK-A1's?` +- **or** a single line `NO_QUESTIONS` if the spec is clear enough to start. + +Keep it short. These questions go to a human who will edit the spec before the +run begins. 
diff --git a/experiments/coordination-layer/harness/src/drivers.js b/experiments/coordination-layer/harness/src/drivers.js new file mode 100644 index 0000000..3c4436b --- /dev/null +++ b/experiments/coordination-layer/harness/src/drivers.js @@ -0,0 +1,102 @@ +'use strict'; + +// Driver abstraction: how to invoke each coding agent *headlessly* (no chat UI, +// no human in the loop) from a system CLI. This is the whole point of the +// harness — it replaces "Noel pastes a prompt into the VS Code chat panel and +// babysits" with a spawnable subprocess. +// +// Each driver returns a spawn spec: { cmd, args, input }. The prompt is fed on +// stdin (uniform across CLIs, avoids arg-length limits and quoting hell). The +// runner sets cwd + env (TICK_REPO_ROOT etc.) on the child. +// +// Real-CLI flags can be overridden with env vars (GEMINI_CMD/GEMINI_ARGS, etc.) +// so the binary/flags can be tuned without editing this file — CLIs move fast. + +const { execSync } = require('child_process'); +const path = require('path'); + +function which(bin) { + try { + execSync(`command -v ${bin}`, { stdio: ['ignore', 'pipe', 'ignore'] }); + return true; + } catch { + return false; + } +} + +function envArgs(name) { + const v = process.env[name]; + return v ? v.split(' ').filter(Boolean) : null; +} + +// --- gemini ----------------------------------------------------------------- +// Headless: pipe the prompt on stdin. `--yolo` auto-approves tool calls so the +// agent can actually edit files unattended. Preflight needs no tools. +const gemini = { + name: 'gemini', + bin: process.env.GEMINI_CMD || 'gemini', + available() { return which(this.bin); }, + spawnSpec({ mode, prompt }) { + const args = envArgs('GEMINI_ARGS') || (mode === 'run' ? ['--yolo'] : []); + return { cmd: this.bin, args, input: prompt }; + }, +}; + +// --- codex ------------------------------------------------------------------ +// `codex exec` is the non-interactive entrypoint. 
`--full-auto` lets it run +// commands/edits without prompting. In a throwaway trial workspace this is safe. +const codex = { + name: 'codex', + bin: process.env.CODEX_CMD || 'codex', + available() { return which(this.bin); }, + spawnSpec({ mode, prompt }) { + const base = envArgs('CODEX_ARGS') || (mode === 'run' ? ['exec', '--full-auto', '-'] : ['exec', '-']); + return { cmd: this.bin, args: base, input: prompt }; + }, +}; + +// --- claude (also useful as a real third driver / control) ------------------ +const claude = { + name: 'claude', + bin: process.env.CLAUDE_CMD || 'claude', + available() { return which(this.bin); }, + spawnSpec({ mode, prompt }) { + const args = envArgs('CLAUDE_ARGS') || + (mode === 'run' ? ['-p', '--permission-mode', 'acceptEdits'] : ['-p']); + return { cmd: this.bin, args, input: prompt }; + }, +}; + +// --- mock ------------------------------------------------------------------- +// A deterministic stand-in that exercises the FULL coordination protocol +// (tick take → work → tick done/break) without a real model or API key. Lets +// the entire battery run — and the harness be validated — in any environment. +// It reads the parsed spec JSON rather than the prose prompt. 
+const mock = { + name: 'mock', + available() { return true; }, + spawnSpec({ mode, agent, specPath, workdir, tickBin }) { + return { + cmd: process.execPath, // node + args: [ + path.join(__dirname, 'mock-agent.js'), + '--agent', agent, + '--mode', mode, + '--spec', specPath, + '--workdir', workdir, + '--tick', tickBin, + ], + input: null, + }; + }, +}; + +const DRIVERS = { gemini, codex, claude, mock }; + +function getDriver(name) { + const d = DRIVERS[name]; + if (!d) throw new Error(`unknown driver: ${name} (known: ${Object.keys(DRIVERS).join(', ')})`); + return d; +} + +module.exports = { DRIVERS, getDriver, which }; diff --git a/experiments/coordination-layer/harness/src/mock-agent.js b/experiments/coordination-layer/harness/src/mock-agent.js new file mode 100755 index 0000000..5721067 --- /dev/null +++ b/experiments/coordination-layer/harness/src/mock-agent.js @@ -0,0 +1,191 @@ +#!/usr/bin/env node +'use strict'; + +// Deterministic mock agent. Stands in for a real Gemini/Codex CLI so the whole +// battery runs without an API key. It speaks the SAME protocol a real agent is +// prompted to use (tick take → work → tick done/break), so a mock run exercises +// the coordination layer and the harness's observability for real — only the +// "writing code" step is simulated. +// +// Modes: +// --mode preflight — print clarifying questions (or NO_QUESTIONS) and exit +// --mode run — loop claiming + completing tasks until none remain +// +// Reads the parsed spec JSON (--spec); talks to the isolated coordination state +// via the tick CLI (--tick) with TICK_REPO_ROOT pointed at --workdir. 
const fs = require('fs');
const path = require('path');
const { execFileSync } = require('child_process');

// Turn a flat `--flag value --flag value` list into { flag: value }.
// Flags are consumed strictly in pairs; a trailing flag with no value maps to
// undefined. Leading dashes are stripped from the flag name.
function parseArgs(argv) {
  const f = {};
  for (let i = 0; i < argv.length; i += 2) f[argv[i].replace(/^--/, '')] = argv[i + 1];
  return f;
}

const args = parseArgs(process.argv.slice(2));
const agent = args.agent;
const mode = args.mode || 'run';

// Fail fast with a readable message instead of the TypeError that
// path.resolve(undefined) / readFileSync(undefined) / execFileSync(node,
// [undefined]) would otherwise raise further down.
const requiredFlags = mode === 'preflight' ? ['spec', 'workdir'] : ['spec', 'workdir', 'tick'];
const missingFlags = requiredFlags.filter((name) => !args[name]);
if (missingFlags.length) {
  process.stderr.write(
    `mock-agent: missing required flag(s): ${missingFlags.map((name) => `--${name}`).join(', ')}\n`
  );
  process.exit(2);
}

const workdir = path.resolve(args.workdir);
const tickBin = args.tick;
const spec = JSON.parse(fs.readFileSync(args.spec, 'utf8'));
const tasksById = new Map(spec.tasks.map(t => [t.id, t]));

// A non-numeric TRIAL_MAX_ATTEMPTS used to become NaN, which made the verify
// loop (`attempt < maxAttempts`) never run and every task with a verify
// command circuit-break immediately. Fall back to the default of 3 instead.
const parsedMaxAttempts = Number(process.env.TRIAL_MAX_ATTEMPTS || 3);
const maxAttempts = Number.isFinite(parsedMaxAttempts) ? parsedMaxAttempts : 3;

const tickEnv = { ...process.env, TICK_REPO_ROOT: workdir, TICK_AGENT: agent };

// Run one tick CLI command synchronously in the trial workspace and return its
// trimmed stdout.
//
// The claim lock (.tick/locks/claim.lock) is shared across all agents, so two
// agents calling `tick take` at the same instant collide — the loser gets a
// "lock held" error. That is expected; retry with a short backoff. A real agent
// is told to do the same in the integration prompt.
//
// Only lock-contention failures are retried (up to `retries` times, with a
// linearly growing pause); any other failure is rethrown unchanged.
function tick(argsArr, { retries = 8 } = {}) {
  for (let attempt = 0; ; attempt++) {
    try {
      return execFileSync(process.execPath, [tickBin, ...argsArr], {
        cwd: workdir, env: tickEnv, encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'],
      }).trim();
    } catch (err) {
      const msg = (err.stderr || '') + (err.message || '');
      if (/lock held|claim is in progress/i.test(msg) && attempt < retries) {
        sleep(60 + attempt * 40);
        continue;
      }
      throw err;
    }
  }
}

// Print one line to stdout, prefixed with this agent's id so interleaved
// transcripts stay attributable.
function say(msg) {
  process.stdout.write(`[${agent}] ${msg}\n`);
}

// A small, deterministic-but-nonzero delay so two mock agents running in
// parallel produce genuinely overlapping claim windows (the metric the spike
// cares about). Hash the string for variety without randomness.
// Map a seed string to a stable delay in the range 250–649 ms using a simple
// 31-multiplier rolling hash kept within unsigned 32 bits.
function workDelayMs(seed) {
  let h = 0;
  for (const c of seed) h = (h * 31 + c.charCodeAt(0)) >>> 0;
  return 250 + (h % 400);
}

// Block the current thread for roughly `ms` milliseconds.
//
// Still synchronous and dependency-free, but it parks the thread with
// Atomics.wait instead of spinning on Date.now(): a spin loop pins a CPU core
// per agent, which distorts the very overlap timings the parallel mock agents
// exist to produce. Non-positive or NaN durations return immediately (an
// unguarded NaN timeout would make Atomics.wait block forever).
function sleep(ms) {
  if (!(ms > 0)) return;
  // The cell is private to this call and never notified, so the wait can only
  // end by timing out.
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
}

// Resolve the concrete files a task should produce/edit. Prefer the explicit
// Files list; otherwise derive a placeholder file from each declared glob.
function targetFiles(task) {
  if (task.files && task.files.length) return task.files;
  return task.paths.map(g => g.replace(/\/\*\*.*$/, '/MOCK_OUTPUT.txt').replace(/\*+/g, 'mock'));
}

// Write `contents` to `rel` (relative to the workspace), creating parent
// directories as needed. Overwrites an existing file.
function ensureFile(rel, contents) {
  const abs = path.join(workdir, rel);
  fs.mkdirSync(path.dirname(abs), { recursive: true });
  fs.writeFileSync(abs, contents);
}

// Apply a mock solution mapping "target<=source[,target<=source]" by copying
// each source file over its target. This is how debug scenarios go red→green
// without a real agent. Real drivers never see this field.
//
// Malformed pairs and missing source files are skipped. Returns true when at
// least one file was copied.
function applyMockSolution(task) {
  if (!task.mock_solution) return false;
  let applied = false;
  for (const pair of task.mock_solution.split(',')) {
    const [target, source] = pair.split('<=').map(s => s.trim());
    if (!target || !source) continue;
    const srcAbs = path.join(workdir, source);
    const dstAbs = path.join(workdir, target);
    if (fs.existsSync(srcAbs)) {
      fs.mkdirSync(path.dirname(dstAbs), { recursive: true });
      fs.copyFileSync(srcAbs, dstAbs);
      applied = true;
    }
  }
  return applied;
}

// Run a task's verify command through `sh -c` inside the workspace.
// Returns { ok: true } on exit 0, otherwise { ok: false, output } with the
// command's combined stdout + stderr.
function runVerify(cmd) {
  try {
    execFileSync('sh', ['-c', cmd], { cwd: workdir, env: tickEnv, encoding: 'utf8', stdio: 'pipe' });
    return { ok: true };
  } catch (err) {
    return { ok: false, output: (err.stdout || '') + (err.stderr || '') };
  }
}

// Preflight mode: print exactly one QUESTION line or NO_QUESTIONS, then exit 0.
function doPreflight() {
  // A real agent would read the spec and decide if anything is ambiguous. The
  // mock emits a deterministic, spec-derived question for one well-known
  // ambiguity class (shared/undeclared files) to exercise the human gate, and
  // NO_QUESTIONS otherwise.
  const firstShared = spec.tasks.find(t => !t.files || !t.files.length);
  if (firstShared) {
    say(`QUESTION: ${firstShared.id} declares scope \`${firstShared.paths.join(', ')}\` but lists no explicit Files — ` +
      `should I treat the whole glob as mine, or are some files shared with another task?`);
  } else if (spec.project.kind === 'debug') {
    say('QUESTION: should I prioritise making existing tests pass without adding new test files, ' +
      'or may I add regression tests within my declared scope?');
  } else {
    say('NO_QUESTIONS');
  }
  process.exit(0);
}

// Run mode: claim → "work" → verify → done/break, until `tick take` reports
// nothing left, the claim cap is hit, or the output is unrecognised. The
// `guard` counter bounds the loop at 100 iterations so a misbehaving tick
// cannot spin the mock forever. Always exits 0.
function doRun() {
  let completed = 0;
  let guard = 0;
  while (guard++ < 100) {
    const out = tick(['take', '--agent', agent]);
    if (out.startsWith('(no available task)')) { say('no available task — standing down'); break; }
    if (out.includes('claim limit reached')) { say('at claim cap — finishing current work'); break; }

    const m = out.match(/^won:\s+(TASK-[A-Za-z0-9_-]+)/);
    if (!m) { say(`unexpected take output: ${out}`); break; }
    const task = tasksById.get(m[1]);
    if (!task) { say(`claimed unknown task ${m[1]} — releasing`); tick(['release', m[1], '--agent', agent]); continue; }

    say(`claimed ${task.id} — working`);
    sleep(workDelayMs(agent + task.id));

    // "Edit" the declared files (build scenarios), then apply any solution
    // patch (debug scenarios).
    for (const rel of targetFiles(task)) {
      const abs = path.join(workdir, rel);
      if (!fs.existsSync(abs)) {
        ensureFile(rel, `// ${task.id} — produced by mock agent ${agent}\n`);
      } else {
        fs.appendFileSync(abs, `\n// touched by ${agent} for ${task.id}\n`);
      }
    }

    let attempt = 0;
    let verified = !task.verify; // no verify command ⇒ trivially "passes"
    let lastOutput = '';
    while (!verified && attempt < maxAttempts) {
      attempt++;
      applyMockSolution(task); // a real agent debugs; the mock applies the known fix
      const r = runVerify(task.verify);
      verified = r.ok;
      lastOutput = r.output || '';
      if (!verified) sleep(120);
    }

    if (verified) {
      tick(['done', task.id, '--agent', agent, '--note', `mock-completed in ${attempt} attempt(s)`]);
      say(`done ${task.id}`);
      completed++;
    } else {
      // Only the first line of the verify output goes into the break reason.
      const reason = `verify failed after ${maxAttempts} attempts: ${lastOutput.split('\n')[0] || 'unknown'}`;
      tick(['break', task.id, '--agent', agent, '--reason', reason]);
      say(`circuit-break ${task.id} (${reason})`);
    }
  }
  say(`finished — completed ${completed} task(s)`);
  process.exit(0);
}

if (mode === 'preflight') doPreflight();
else doRun();

// diff --git a/experiments/coordination-layer/harness/src/observe.js b/experiments/coordination-layer/harness/src/observe.js
// new file mode 100644
// index 0000000..da5bc67
// --- /dev/null
// +++ b/experiments/coordination-layer/harness/src/observe.js
// @@ -0,0 +1,57 @@
'use strict';

// Observability primitives for a trial run. Everything a trial does is recorded
// in three places so a run is fully reconstructable after the fact:
//
//   run.jsonl   — one structured JSON line per harness event (the spine)
//   logs/.log   — raw stdout+stderr transcript of each agent process
//   report/     — analyze.json / analyze.md / SUMMARY.md (written at end)
//
// No logging library — append-only JSONL + plain files, matching the rest of
// AI-DDTK's conventions.
const fs = require('fs');
const path = require('path');

// Records everything one trial run does under a single run directory.
// Constructing a Recorder creates <runDir>/, <runDir>/logs/ and
// <runDir>/report/ if they do not already exist.
class Recorder {
  constructor(runDir) {
    this.runDir = runDir;
    this.jsonlPath = path.join(runDir, 'run.jsonl');
    fs.mkdirSync(runDir, { recursive: true });
    fs.mkdirSync(path.join(runDir, 'logs'), { recursive: true });
    fs.mkdirSync(path.join(runDir, 'report'), { recursive: true });
  }

  // Append one structured event to the run spine and echo a terse line to the
  // console so a human watching the terminal sees progress live.
  //
  // `ts` and `type` are owned by the recorder: they are listed first so they
  // lead every JSON line, and re-applied last so a payload that happens to
  // carry its own `ts`/`type` key cannot overwrite the spine's fields.
  // A `verbose` payload key is written to run.jsonl but left out of the echo.
  // Returns the record that was written.
  event(type, data = {}) {
    const ts = new Date().toISOString();
    const rec = Object.assign({ ts, type }, data, { ts, type });
    fs.appendFileSync(this.jsonlPath, JSON.stringify(rec) + '\n');
    const detail = Object.entries(data)
      .filter(([k]) => k !== 'verbose')
      .map(([k, v]) => `${k}=${typeof v === 'string' ? v : JSON.stringify(v)}`)
      .join(' ');
    process.stdout.write(`  [${rec.ts}] ${type}${detail ? ' ' + detail : ''}\n`);
    return rec;
  }

  // Open an append-mode write stream at logs/<agent>.log. The caller is
  // responsible for ending the stream.
  logStreamFor(agent) {
    return fs.createWriteStream(path.join(this.runDir, 'logs', `${agent}.log`), { flags: 'a' });
  }

  // Write (or overwrite) report/<name> and return its full path.
  writeReportFile(name, contents) {
    const p = path.join(this.runDir, 'report', name);
    fs.writeFileSync(p, contents);
    return p;
  }

  // Read every event back from run.jsonl, in order. Returns [] when nothing
  // has been recorded yet; throws if a line is not valid JSON.
  read() {
    if (!fs.existsSync(this.jsonlPath)) return [];
    return fs.readFileSync(this.jsonlPath, 'utf8')
      .split('\n')
      .filter(Boolean)
      .map(l => JSON.parse(l));
  }
}

module.exports = { Recorder };

// diff --git a/experiments/coordination-layer/harness/src/preflight.js b/experiments/coordination-layer/harness/src/preflight.js
// new file mode 100644
// index 0000000..1ae338b
// --- /dev/null
// +++ b/experiments/coordination-layer/harness/src/preflight.js
// @@ -0,0 +1,97 @@
'use strict';

// Preflight phase — the "let the agents ask questions before any work" step.
//
// Each participating agent is given the spec + its candidate task list in a
// READ-ONLY prompt and asked to surface clarifying questions (or NO_QUESTIONS).
// We capture each agent's questions, write a combined QUESTIONS.md, and STOP at
// a human gate — the run does not start until a human reviews the questions
// (and, ideally, edits the spec to answer them). `--auto` skips the gate for
// unattended/CI use.

const fs = require('fs');
const path = require('path');
const { getDriver } = require('./drivers');
const { buildPreflightPrompt } = require('./prompts');
const { runProc } = require('./proc');

// Pull clarifying questions out of an agent's free-text output. The contract
// is deliberately loose: any line containing "QUESTION:" (any case) yields the
// text after that marker, and a line consisting only of NO_QUESTIONS —
// optionally behind a "[tag]"-style prefix — signals an explicit "none".
// Returns { questions: string[], explicitNone: boolean }.
function extractQuestions(output) {
  const questions = [];
  let explicitNone = false;
  for (const line of output.split('\n')) {
    if (/question:/i.test(line)) {
      const text = line.replace(/^.*?question:\s*/i, '').trim();
      if (text) questions.push(text);
    }
    if (/^\s*\[?[\w-]*\]?\s*no_questions\s*$/i.test(line)) explicitNone = true;
  }
  return { questions, explicitNone };
}

// Run the preflight round: one agent at a time, in the order given. Each
// agent's transcript goes to logs/preflight-<id>.log, the combined questions to
// <runDir>/preflight/QUESTIONS.md.
// Resolves with { perAgent, totalQuestions, questionsPath }.
async function runPreflight({ rec, spec, agents, workdir, tickBin, specPath, timeoutMs }) {
  const roster = agents.map(a => `${a.id}:${a.driver}`).join(',');
  rec.event('preflight.start', { agents: roster });

  const perAgent = {};
  for (const agentSpec of agents) {
    const agentId = agentSpec.id;
    const driver = getDriver(agentSpec.driver);
    const prompt = buildPreflightPrompt({
      agent: agentId,
      project: spec.project,
      tasks: spec.tasks,
      workdir,
    });
    const launch = driver.spawnSpec({
      mode: 'preflight',
      agent: agentId,
      prompt,
      workdir,
      specPath,
      tickBin,
    });

    const logStream = rec.logStreamFor(`preflight-${agentId}`);
    const childEnv = { ...process.env, TICK_REPO_ROOT: workdir };
    const result = await runProc({
      cmd: launch.cmd,
      args: launch.args,
      input: launch.input,
      cwd: workdir,
      env: childEnv,
      logStream,
      timeoutMs,
    });
    logStream.end();

    const parsed = extractQuestions(result.output);
    perAgent[agentId] = {
      driver: driver.name,
      questions: parsed.questions,
      explicitNone: parsed.explicitNone,
      exit: result.code,
    };
    rec.event('preflight.agent', {
      agent: agentId,
      driver: driver.name,
      questions: parsed.questions.length,
      exit: result.code,
    });
  }

  const questionsPath = path.join(rec.runDir, 'preflight', 'QUESTIONS.md');
  const markdown = renderQuestionsMd(spec, perAgent);
  fs.mkdirSync(path.dirname(questionsPath), { recursive: true });
  fs.writeFileSync(questionsPath, markdown);
  rec.event('preflight.end', { questions_file: path.relative(rec.runDir, questionsPath) });

  let totalQuestions = 0;
  for (const entry of Object.values(perAgent)) totalQuestions += entry.questions.length;
  return { perAgent, totalQuestions, questionsPath };
}

// Render the combined questions as markdown: a title and short instructions,
// then one section per agent with a checkbox per question (or a note saying
// whether "no questions" was explicit or merely not detected).
function renderQuestionsMd(spec, perAgent) {
  const header = [
    `# Preflight questions — ${spec.project.name}`,
    '',
    '> Generated by the trial harness preflight phase. Review these, edit the',
    '> project spec to answer anything material, then start the run.',
    '',
  ];
  const sections = Object.entries(perAgent).flatMap(([agent, data]) => {
    const items = data.questions.length === 0
      ? [data.explicitNone ? '- _No questions._' : '- _No questions detected in output._']
      : data.questions.map(q => `- [ ] ${q}`);
    return [`## ${agent} (${data.driver})`, '', ...items, ''];
  });
  return [...header, ...sections].join('\n');
}

module.exports = { runPreflight, extractQuestions };

// diff --git a/experiments/coordination-layer/harness/src/proc.js b/experiments/coordination-layer/harness/src/proc.js
// new file mode 100644
// index 0000000..39eee10
// --- /dev/null
// +++ b/experiments/coordination-layer/harness/src/proc.js
// @@ -0,0 +1,49 @@
'use strict';

// Spawn one agent process from a driver spawn-spec, feed the prompt on stdin,
// and tee stdout+stderr both to a capture buffer and (optionally) to a log
// stream. Resolves with { code, signal, output, ms }. Never rejects — a
// non-zero exit is data, not an exception, for a trial.
const { spawn } = require('child_process');

// Run one child process to completion and resolve with what happened.
//
//   cmd, args  — executable and argument list (passed straight to spawn)
//   input      — text written to the child's stdin, or null/undefined for none;
//                stdin is closed afterwards either way
//   cwd, env   — working directory and environment for the child
//   logStream  — optional writable; receives everything captured, as it arrives
//   timeoutMs  — optional; the child is SIGKILLed once this many ms elapse
//
// Resolves (never rejects) with:
//   normal exit    { code, signal, timedOut, output, ms }
//   spawn failure  { code: -1, signal: null, timedOut, spawnError, output, ms }
// where `output` is interleaved stdout + stderr and `ms` is wall-clock time.
function runProc({ cmd, args, input, cwd, env, logStream, timeoutMs }) {
  return new Promise((resolve) => {
    const started = Date.now();
    let output = '';
    let timedOut = false;
    let timer = null;
    let settled = false;

    // Resolve at most once. A failed spawn can emit both 'error' and 'close';
    // the first one wins, and nothing is captured or logged after that because
    // the caller is free to end the log stream as soon as we resolve.
    const settle = (fields) => {
      if (settled) return;
      settled = true;
      if (timer) clearTimeout(timer);
      resolve({ ...fields, output, ms: Date.now() - started });
    };

    const capture = (text) => {
      if (settled) return;
      output += text;
      if (logStream) logStream.write(text);
    };

    const failSpawn = (err) => {
      capture(`\n[spawn error] ${err.message}\n`);
      settle({ code: -1, signal: null, timedOut, spawnError: err.message });
    };

    let child;
    try {
      child = spawn(cmd, args, { cwd, env, stdio: ['pipe', 'pipe', 'pipe'] });
    } catch (err) {
      // spawn() throws synchronously for invalid arguments; keep the
      // "never rejects" contract by reporting it like any other spawn failure.
      failSpawn(err);
      return;
    }

    for (const stream of [child.stdout, child.stderr]) {
      if (!stream) continue;
      // Decode through the stream so a multi-byte UTF-8 character split across
      // two chunks is not turned into replacement characters.
      stream.setEncoding('utf8');
      stream.on('data', capture);
    }

    if (timeoutMs) {
      timer = setTimeout(() => { timedOut = true; child.kill('SIGKILL'); }, timeoutMs);
    }

    child.on('error', failSpawn);
    child.on('close', (code, signal) => settle({ code, signal, timedOut }));

    if (child.stdin) {
      // A child that exits (or never starts) before draining stdin makes the
      // write fail with EPIPE. Without a listener that becomes an unhandled
      // 'error' event and takes the whole harness down; the exit code already
      // tells the caller what happened, so the pipe error is ignored.
      child.stdin.on('error', () => {});
      if (input != null) {
        child.stdin.write(input);
      }
      child.stdin.end();
    }
  });
}

module.exports = { runProc };

// diff --git a/experiments/coordination-layer/harness/src/prompts.js b/experiments/coordination-layer/harness/src/prompts.js
// new file mode 100644
// index 0000000..061bbf5
// --- /dev/null
// +++ b/experiments/coordination-layer/harness/src/prompts.js
// @@ -0,0 +1,74 @@
'use strict';

// Build the prompts handed to each agent CLI. Templates live in ../prompts/
// and use {{PLACEHOLDER}} markers. Keeping the prose in markdown files (not
// string literals) means a human can tune the integration prompt without
// touching code — the single biggest lever on real-agent compliance per the
// Run 1/2 retros.
const fs = require('fs');
const path = require('path');

const PROMPT_DIR = path.join(__dirname, '..', 'prompts');

// Substitute {{NAME}} markers in `template` with String(vars.NAME).
//
// Markers with no matching key are left in place verbatim so a typo in a
// template stays visible in the rendered prompt. Only own keys of `vars`
// count: a marker such as {{constructor}} or {{toString}} must not pick up an
// inherited Object.prototype member and paste function source into the prompt.
function render(template, vars) {
  return template.replace(/\{\{(\w+)\}\}/g, (marker, key) =>
    Object.prototype.hasOwnProperty.call(vars, key) ? String(vars[key]) : marker
  );
}

// Render the task list as a markdown table: id, priority, declared scope and a
// one-line summary (the title, or the first 60 characters of the description
// when the task has no title).
function taskTable(tasks) {
  const rows = tasks.map(t =>
    `| ${t.id} | ${t.priority} | \`${t.paths.join('`, `')}\` | ${t.title || t.description.slice(0, 60)} |`
  );
  return [
    '| Task | Priority | Declared scope | Summary |',
    '| --- | --- | --- | --- |',
    ...rows,
  ].join('\n');
}

// Render one "### TASK-id — title" section per task, separated by blank lines.
// Files, dependencies and the verify command are listed only when present.
function taskDetails(tasks) {
  return tasks.map(t => {
    // An untitled task gets a bare "### TASK-id" heading; trimming alone would
    // leave a dangling " —" separator behind.
    const heading = t.title ? `### ${t.id} — ${t.title}` : `### ${t.id}`;
    const lines = [heading.trim()];
    lines.push(`- Scope: \`${t.paths.join('`, `')}\``);
    if (t.files.length) lines.push(`- Files: \`${t.files.join('`, `')}\``);
    if (t.depends_on.length) lines.push(`- Depends on (contract only): ${t.depends_on.join(', ')}`);
    lines.push(`- Description: ${t.description}`);
    lines.push(`- Acceptance: ${t.acceptance}`);
    if (t.verify) lines.push(`- Verify: \`${t.verify}\``);
    return lines.join('\n');
  }).join('\n\n');
}

// Build the run-mode prompt for one agent from prompts/agent-loop.md.
// The template is re-read on every call, so edits to the markdown take effect
// on the next run. Throws if the template file cannot be read.
function buildAgentPrompt({ agent, project, tasks, tickCmd, workdir, maxAttempts }) {
  const template = fs.readFileSync(path.join(PROMPT_DIR, 'agent-loop.md'), 'utf8');
  return render(template, {
    AGENT: agent,
    PROJECT_NAME: project.name,
    PROJECT_GOAL: project.goal,
    PROJECT_KIND: project.kind,
    MAX_CLAIMS: project.max_active_claims_per_agent,
    MAX_ATTEMPTS: maxAttempts,
    TICK: tickCmd,
    WORKDIR: workdir,
    TASK_TABLE: taskTable(tasks),
    TASK_DETAILS: taskDetails(tasks),
    PROJECT_VERIFY: project.verify || '(none — rely on per-task verify / acceptance)',
  });
}

// Build the preflight prompt for one agent from prompts/preflight.md.
// Throws if the template file cannot be read.
function buildPreflightPrompt({ agent, project, tasks, workdir }) {
  const template = fs.readFileSync(path.join(PROMPT_DIR, 'preflight.md'), 'utf8');
  return render(template, {
    AGENT: agent,
    PROJECT_NAME: project.name,
    PROJECT_GOAL: project.goal,
    PROJECT_KIND: project.kind,
    WORKDIR: workdir,
    TASK_TABLE: taskTable(tasks),
    TASK_DETAILS: taskDetails(tasks),
  });
}

module.exports = { buildAgentPrompt, buildPreflightPrompt, render, taskTable, taskDetails };

// diff --git a/experiments/coordination-layer/harness/src/run.js b/experiments/coordination-layer/harness/src/run.js
// new file mode 100644
// index 0000000..3d66347
// --- /dev/null
// +++ b/experiments/coordination-layer/harness/src/run.js
// @@ -0,0 +1,204 @@
'use strict';

// Trial orchestrator. Given a parsed spec and a set of {id, driver} agents:
//
//   1. build an isolated workspace (its own throwaway git repo + .tick/ state)
//   2. copy any fixture in, drop a `./tick` shim, init + seed the task backlog
//   3. spawn every agent CLI concurrently in run mode, capturing transcripts
//   4. run the project verify command (if any)
//   5. run `tick analyze` and write a human SUMMARY.md
//
// The real repo's `.tick/` is never touched — TICK_REPO_ROOT points at the
// per-run workspace. That isolation is what makes the battery safe to run
// repeatedly and in parallel.
+ +const fs = require('fs'); +const path = require('path'); +const { execFileSync } = require('child_process'); +const { getDriver } = require('./drivers'); +const { buildAgentPrompt } = require('./prompts'); +const { runProc } = require('./proc'); + +const TICK_BIN = path.join(__dirname, '..', '..', 'bin', 'tick'); + +function tick(workspace, argsArr) { + return execFileSync(process.execPath, [TICK_BIN, ...argsArr], { + cwd: workspace, + env: { ...process.env, TICK_REPO_ROOT: workspace }, + encoding: 'utf8', + }).trim(); +} + +function copyDir(src, dst) { + fs.mkdirSync(dst, { recursive: true }); + for (const entry of fs.readdirSync(src, { withFileTypes: true })) { + const s = path.join(src, entry.name); + const d = path.join(dst, entry.name); + if (entry.isDirectory()) copyDir(s, d); + else fs.copyFileSync(s, d); + } +} + +function prepareWorkspace({ rec, spec, specDir }) { + const workspace = path.join(rec.runDir, 'workspace'); + fs.mkdirSync(workspace, { recursive: true }); + + // Throwaway git repo so any git-touching tick path is harmless (no remote ⇒ + // nothing is pushed) and so a real agent has a clean repo to work in. + const gitEnv = { + ...process.env, + GIT_AUTHOR_NAME: 'trial', GIT_AUTHOR_EMAIL: 'trial@local', + GIT_COMMITTER_NAME: 'trial', GIT_COMMITTER_EMAIL: 'trial@local', + }; + execFileSync('git', ['init', '-q'], { cwd: workspace, env: gitEnv }); + execFileSync('git', ['config', 'user.name', 'trial'], { cwd: workspace }); + execFileSync('git', ['config', 'user.email', 'trial@local'], { cwd: workspace }); + + // Copy fixture (seeded-bug code for debug scenarios), if declared. 
+ if (spec.project.fixture) { + const fixtureSrc = path.resolve(specDir, spec.project.fixture); + if (!fs.existsSync(fixtureSrc)) throw new Error(`fixture not found: ${fixtureSrc}`); + copyDir(fixtureSrc, workspace); + rec.event('workspace.fixture', { from: path.relative(process.cwd(), fixtureSrc) }); + } + + // `./tick` shim so agent prompts can call the CLI without an absolute path. + const shim = workspace + '/tick'; + fs.writeFileSync(shim, `#!/bin/sh\nexec "${process.execPath}" "${TICK_BIN}" "$@"\n`); + fs.chmodSync(shim, 0o755); + + // Initialise coordination state and seed the backlog from the spec. + tick(workspace, ['init']); + for (const t of spec.tasks) { + tick(workspace, [ + 'log', 'task.created', t.id, + '--agent', 'dispatcher', + '--priority', String(t.priority), + '--paths', t.paths.join(','), + ]); + rec.event('seed.task', { task: t.id, priority: t.priority, paths: t.paths.join(',') }); + } + + // Commit the seeded workspace so the agent starts from a clean tree. + execFileSync('git', ['add', '-A'], { cwd: workspace, env: gitEnv }); + execFileSync('git', ['commit', '-q', '-m', 'trial: seed workspace'], { cwd: workspace, env: gitEnv }); + + return workspace; +} + +async function runTrial({ rec, spec, specPath, agents, workspace, timeoutMs, maxAttempts }) { + rec.event('trial.start', { + project: spec.project.name, kind: spec.project.kind, + tasks: spec.tasks.length, agents: agents.map(a => `${a.id}:${a.driver}`).join(','), + }); + + // Spawn every agent concurrently. Each runs the integration prompt in its + // driver's headless mode; the mock driver ignores the prose and reads the + // spec JSON, but every driver gets the same env + cwd. 
+ const agentRuns = agents.map(async (agentSpec) => { + const driver = getDriver(agentSpec.driver); + const prompt = buildAgentPrompt({ + agent: agentSpec.id, + project: spec.project, + tasks: spec.tasks, + tickCmd: './tick', + workdir: workspace, + maxAttempts, + }); + fs.writeFileSync(path.join(rec.runDir, `prompt-${agentSpec.id}.md`), prompt); + + const ss = driver.spawnSpec({ + mode: 'run', agent: agentSpec.id, prompt, workdir: workspace, specPath, tickBin: TICK_BIN, + }); + const logStream = rec.logStreamFor(agentSpec.id); + rec.event('agent.spawn', { agent: agentSpec.id, driver: driver.name, cmd: `${ss.cmd} ${ss.args.join(' ')}` }); + + const result = await runProc({ + cmd: ss.cmd, args: ss.args, input: ss.input, + cwd: workspace, + env: { ...process.env, TICK_REPO_ROOT: workspace, TICK_AGENT: agentSpec.id, TRIAL_MAX_ATTEMPTS: String(maxAttempts) }, + logStream, timeoutMs, + }); + logStream.end(); + rec.event('agent.exit', { + agent: agentSpec.id, code: result.code, ms: result.ms, + timedOut: !!result.timedOut, spawnError: result.spawnError || null, + }); + return { agent: agentSpec.id, ...result }; + }); + + const agentResults = await Promise.all(agentRuns); + + // Project verify (does the integrated result actually build/pass?). + let verify = null; + if (spec.project.verify) { + try { + const out = execFileSync('sh', ['-c', spec.project.verify], { cwd: workspace, encoding: 'utf8', stdio: 'pipe' }); + verify = { ok: true, output: out }; + } catch (err) { + verify = { ok: false, output: (err.stdout || '') + (err.stderr || '') }; + } + rec.event('verify.result', { ok: verify.ok }); + } + + // Analyze the coordination event log. 
+ const analyzeJson = JSON.parse(tick(workspace, ['analyze', '--format', 'json'])); + const analyzeMd = tick(workspace, ['analyze', '--format', 'md']); + rec.writeReportFile('analyze.json', JSON.stringify(analyzeJson, null, 2)); + rec.writeReportFile('analyze.md', analyzeMd + '\n'); + + const summary = renderSummary({ spec, agents, agentResults, analyzeJson, verify, workspace }); + rec.writeReportFile('SUMMARY.md', summary); + rec.event('trial.end', { + completed: analyzeJson.event_counts.done, + broken: analyzeJson.event_counts.circuit_break, + concurrent_pct: analyzeJson.parallelism.concurrent_pct, + verify: verify ? verify.ok : 'n/a', + }); + + return { workspace, analyzeJson, verify, agentResults, summary }; +} + +function renderSummary({ spec, agents, agentResults, analyzeJson, verify }) { + const p = analyzeJson.parallelism; + const out = []; + out.push(`# Trial summary — ${spec.project.name}`); + out.push(''); + out.push(`- **Kind:** ${spec.project.kind}`); + out.push(`- **Agents:** ${agents.map(a => `${a.id} (${a.driver})`).join(', ')}`); + out.push(`- **Tasks seeded:** ${spec.tasks.length}`); + out.push(`- **Completed (\`tick done\`):** ${analyzeJson.event_counts.done}`); + out.push(`- **Circuit-broken:** ${analyzeJson.event_counts.circuit_break}`); + out.push(`- **Concurrent-claim time:** ${p.concurrent_pct == null ? 'n/a' : p.concurrent_pct + '%'} ` + + `(target ≥ 50%)`); + if (verify) out.push(`- **Project verify:** ${verify.ok ? '✅ pass' : '❌ fail'}`); + out.push(''); + out.push('## Per-agent process result'); + out.push(''); + out.push('| Agent | Driver | Exit | Wall ms | Notes |'); + out.push('| --- | --- | --- | --- | --- |'); + for (const r of agentResults) { + const a = agents.find(x => x.id === r.agent); + const notes = r.spawnError ? `spawn error: ${r.spawnError}` : (r.timedOut ? 
'timed out' : 'ok'); + out.push(`| ${r.agent} | ${a.driver} | ${r.code} | ${r.ms} | ${notes} |`); + } + out.push(''); + out.push('## Coordination analysis'); + out.push(''); + out.push('See `report/analyze.md` for the full `tick analyze` output. Per-agent claim/done counts:'); + out.push(''); + for (const ag of analyzeJson.agents) { + out.push(`- **${ag.agent}** — claimed ${ag.claims}, done ${ag.dones}, broken ${ag.breaks}, released ${ag.releases}`); + } + out.push(''); + out.push('## Verdict'); + out.push(''); + const ok = (!verify || verify.ok) && analyzeJson.event_counts.done > 0; + out.push(ok + ? '✅ Tasks completed through the coordination protocol with no protocol errors.' + : '⚠️ Review needed — see broken tasks and/or failed verify above.'); + out.push(''); + return out.join('\n'); +} + +module.exports = { runTrial, prepareWorkspace, TICK_BIN }; diff --git a/experiments/coordination-layer/harness/src/spec.js b/experiments/coordination-layer/harness/src/spec.js new file mode 100644 index 0000000..7b3d9cf --- /dev/null +++ b/experiments/coordination-layer/harness/src/spec.js @@ -0,0 +1,201 @@ +'use strict'; + +// Deterministic project-spec parser — this is "stage 1" of the ingestion +// pipeline scaffolded in ../../ingestion/README.md, finally implemented. +// +// Input: a human-authored PROJECT-SPEC markdown file (see PROJECT-SPEC.template.md). +// Output: a structured { project, tasks[] } object, validated. +// +// Pure and deterministic: the same markdown always yields the same structure. +// No LLM, no network. Hard-fails (throws) on structural problems so a bad spec +// never silently becomes a malformed task list. + +const fs = require('fs'); + +// Parse top-level `**Key:** value` metadata lines that appear before the first +// `## ` heading. Keys are lowercased and spaces collapsed to underscores. 
// Collect `**Key:** value` metadata from the lines that precede the first
// `##`-or-deeper heading. A leading list marker (`-` or `*`) is allowed. Keys
// are lowercased and every run of non-alphanumerics becomes a single
// underscore ("Max active claims per agent" → "max_active_claims_per_agent").
// When a key repeats, the last occurrence wins. Values are trimmed strings.
function parseProjectMeta(lines) {
  const meta = {};
  for (const line of lines) {
    if (/^##+\s/.test(line)) break; // stop at the first `##` section (Constraints, Sub-tasks, …)
    const m = line.match(/^\s*[-*]?\s*\*\*([^:*]+):\*\*\s*(.*)$/);
    if (m) {
      const key = m[1].trim().toLowerCase().replace(/[^a-z0-9]+/g, '_').replace(/^_|_$/g, '');
      meta[key] = m[2].trim();
    }
  }
  return meta;
}

// Split the document into the preamble (everything before the first
// `### TASK-` block) and the task blocks themselves.
//
// Returns { preamble: string[], blocks: [{ id, title, body: string[] }] }.
//
// Every heading of the form `### TASK-<id>` opens a block, whatever follows
// the id: an em dash, en dash, hyphen or colon before the title, a title with
// no separator at all, or nothing. A heading that fails to match would NOT be
// reported — its lines would be folded into the previous task's body and its
// bullets would overwrite that task's fields — so the pattern is deliberately
// forgiving about the separator (en dashes in particular are a common
// editor autocorrect of " - ").
function splitTaskBlocks(text) {
  const lines = text.split(/\r?\n/);
  const blocks = [];
  let current = null;
  const preamble = [];
  // The id must end on an alphanumeric/underscore so a separator hyphen written
  // right after it ("TASK-1- title") is not swallowed into the id.
  const headerRe = /^###\s+(TASK-[A-Za-z0-9_-]*[A-Za-z0-9_])\s*(?:[—–\-:]\s*)?(.*)$/;

  for (const line of lines) {
    const m = line.match(headerRe);
    if (m) {
      if (current) blocks.push(current);
      current = { id: m[1], title: (m[2] || '').trim(), body: [] };
    } else if (current) {
      current.body.push(line);
    } else {
      preamble.push(line);
    }
  }
  if (current) blocks.push(current);
  return { preamble, blocks };
}

// Parse `- **Key:** value` bullets inside a task block into a flat map.
+function parseTaskBullets(bodyLines) { + const fields = {}; + for (const line of bodyLines) { + const m = line.match(/^\s*[-*]\s*\*\*([^:*]+):\*\*\s*(.*)$/); + if (m) { + const key = m[1].trim().toLowerCase().replace(/[^a-z0-9]+/g, '_').replace(/^_|_$/g, ''); + fields[key] = m[2].trim(); + } + } + return fields; +} + +function parseGlobList(v) { + if (!v) return []; + return String(v) + .replace(/`/g, '') + .split(',') + .map(s => s.trim()) + .filter(Boolean) + .filter(s => s.toLowerCase() !== 'none'); +} + +function parseDeps(v) { + if (!v) return []; + return String(v) + .replace(/`/g, '') + .split(',') + .map(s => s.trim()) + .filter(Boolean) + .filter(s => s.toLowerCase() !== 'none'); +} + +function parseSpec(markdownText) { + const { preamble, blocks } = splitTaskBlocks(markdownText); + + // Project title = first `# ` heading. + const titleLine = preamble.find(l => /^#\s+/.test(l)); + const title = titleLine ? titleLine.replace(/^#\s+/, '').replace(/^Project:\s*/i, '').trim() : 'untitled'; + + const meta = parseProjectMeta(preamble); + + const project = { + name: title, + goal: meta.goal || '', + branch: meta.branch || '', + path_scoping_strategy: meta.path_scoping_strategy || meta.path_scoping || 'unspecified', + max_active_claims_per_agent: meta.max_active_claims_per_agent + ? Number(meta.max_active_claims_per_agent) + : 2, + agents: (meta.agents || '') + .split(',') + .map(s => s.trim().toLowerCase()) + .filter(Boolean), + fixture: meta.fixture || null, + verify: meta.verify || null, + kind: (meta.kind || (meta.fixture ? 'debug' : 'build')).toLowerCase(), + }; + + const tasks = blocks.map(b => { + const f = parseTaskBullets(b.body); + return { + id: b.id, + title: b.title, + paths: parseGlobList(f.declared_scope || f.scope || f.paths), + files: parseGlobList(f.files), + priority: f.priority !== undefined && f.priority !== '' ? 
Number(f.priority) : 5, + depends_on: parseDeps(f.depends_on_contract_only || f.depends_on || f.depends), + description: f.description || '', + acceptance: f.acceptance || '', + verify: f.verify || null, + // Mock-only: a path (relative to the fixture/workspace) whose contents the + // mock driver copies in to "solve" the task so debug scenarios go red→green + // without a real agent. Real drivers ignore this field entirely. + mock_solution: f.mock_solution || null, + }; + }); + + validate(project, tasks); + return { project, tasks }; +} + +function validate(project, tasks) { + const errors = []; + + if (!tasks.length) errors.push('no `### TASK-` blocks found'); + + const seen = new Set(); + for (const t of tasks) { + if (seen.has(t.id)) errors.push(`duplicate task id: ${t.id}`); + seen.add(t.id); + if (!t.paths.length) errors.push(`${t.id}: empty declared scope (every task needs at least one path glob)`); + if (!Number.isFinite(t.priority)) errors.push(`${t.id}: non-numeric priority`); + if (!t.description) errors.push(`${t.id}: missing description`); + if (!t.acceptance) errors.push(`${t.id}: missing acceptance criteria`); + } + + // Dependency edges must reference real task ids, and must be acyclic. 
+ for (const t of tasks) { + for (const dep of t.depends_on) { + if (!seen.has(dep)) errors.push(`${t.id}: depends on unknown task ${dep}`); + } + } + const cycle = findCycle(tasks); + if (cycle) errors.push(`dependency cycle: ${cycle.join(' → ')}`); + + if (errors.length) { + throw new Error('spec validation failed:\n - ' + errors.join('\n - ')); + } +} + +function findCycle(tasks) { + const graph = new Map(tasks.map(t => [t.id, t.depends_on])); + const WHITE = 0, GRAY = 1, BLACK = 2; + const color = new Map(tasks.map(t => [t.id, WHITE])); + const stack = []; + + function dfs(id) { + color.set(id, GRAY); + stack.push(id); + for (const dep of graph.get(id) || []) { + if (!color.has(dep)) continue; // unknown dep already reported + if (color.get(dep) === GRAY) return stack.slice(stack.indexOf(dep)).concat(dep); + if (color.get(dep) === WHITE) { + const c = dfs(dep); + if (c) return c; + } + } + stack.pop(); + color.set(id, BLACK); + return null; + } + + for (const t of tasks) { + if (color.get(t.id) === WHITE) { + const c = dfs(t.id); + if (c) return c; + } + } + return null; +} + +function parseSpecFile(filePath) { + const text = fs.readFileSync(filePath, 'utf8'); + return parseSpec(text); +} + +module.exports = { parseSpec, parseSpecFile, validate, findCycle }; diff --git a/experiments/coordination-layer/harness/test/smoke.sh b/experiments/coordination-layer/harness/test/smoke.sh new file mode 100755 index 0000000..8cce173 --- /dev/null +++ b/experiments/coordination-layer/harness/test/smoke.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Smoke test: drive the entire battery with the deterministic MOCK driver, so +# the harness is validated end-to-end with no API keys or real CLIs. Exercises +# spec parse → preflight → seed → concurrent run → analyze → report for every +# trial spec, plus the debug red→green path and the circuit-breaker. +set -euo pipefail + +HARNESS="$(cd "$(dirname "$0")/.." 
&& pwd)" +TRIAL="node $HARNESS/bin/trial" +PASS=0; FAIL=0 + +note() { printf '\n=== %s ===\n' "$1"; } +ok() { printf ' ✓ %s\n' "$1"; PASS=$((PASS+1)); } +bad() { printf ' ✗ %s\n' "$1"; FAIL=$((FAIL+1)); } + +note "validate every spec" +for spec in "$HARNESS"/trials/*.project.md; do + if $TRIAL validate "$spec" >/dev/null; then ok "valid: $(basename "$spec")"; else bad "invalid: $(basename "$spec")"; fi +done + +note "doctor" +$TRIAL doctor >/dev/null && ok "doctor ran" + +# Helper: run a spec with both agents forced onto the mock driver, auto past the +# preflight gate, and echo the resulting run dir. +run_mock() { + local spec="$1" out + out=$(mktemp) + $TRIAL run "$spec" --agents gemini:mock,codex:mock --auto --timeout 60000 >"$out" 2>&1 || true + echo "$HARNESS/$(grep -oE 'runs/[^ ]+' "$out" | head -1)" +} + +note "build trial: build-todo-api (mock)" +RD=$(run_mock build-todo-api) +if grep -q '"done": 6' "$RD/report/analyze.json"; then ok "all 6 build tasks completed"; else bad "todo-api did not complete 6 tasks"; fi + +note "build trial: build-url-shortener (mock)" +RD=$(run_mock build-url-shortener) +if grep -q '"done": 4' "$RD/report/analyze.json"; then ok "all 4 build tasks completed"; else bad "url-shortener did not complete 4 tasks"; fi + +note "debug trial: calc-bugs (mock, red→green)" +RD=$(run_mock debug-calc-bugs) +if grep -q '"done": 2' "$RD/report/analyze.json"; then ok "both bugs fixed (tick done x2)"; else bad "calc-bugs not both done"; fi +if grep -q '✅ pass' "$RD/report/SUMMARY.md"; then ok "project verify passed (tests green)"; else bad "calc-bugs project verify did not pass"; fi + +note "debug trial: poisoned-task (circuit-breaker)" +RD=$(run_mock debug-poisoned-task) +if grep -q '"circuit_break": 1' "$RD/report/analyze.json"; then ok "poisoned task circuit-broke (bounded, not infinite)"; else bad "poison task did not circuit-break"; fi +if grep -q '"done": 1' "$RD/report/analyze.json"; then ok "the fixable task still completed"; else bad 
"fixable task not completed"; fi + +printf '\n--- smoke: %d passed, %d failed ---\n' "$PASS" "$FAIL" +[ "$FAIL" -eq 0 ] diff --git a/experiments/coordination-layer/harness/trials/build-todo-api.project.md b/experiments/coordination-layer/harness/trials/build-todo-api.project.md new file mode 100644 index 0000000..b127e46 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/build-todo-api.project.md @@ -0,0 +1,76 @@ +# Project: Todo REST API (build) + +**Goal:** Build a stdlib-only Node.js Todo REST API split into an HTTP layer and a store layer so two agents can build the halves concurrently without colliding. This is the Run 2 fixture shape, re-expressed as a harness-driven trial. +**Branch:** experiment/coordination-layer +**Kind:** build +**Agents:** gemini, codex +**Path-scoping strategy:** half-wide (HTTP half vs store half) +**Max active claims per agent:** 2 + +## Constraints + +- Standard library only — no npm dependencies. +- Do not edit shared files: `package.json`, lockfiles. +- HTTP and store halves communicate only through the documented store contract. + +## Interface contracts (if halves integrate) + +The HTTP layer calls the store through a documented interface (`create`, `get`, +`list`, `update`, `remove`). Tasks depend only on that contract, never on each +other's code. + +## Sub-tasks + +### TASK-A1 — HTTP server bootstrap + +- **Declared scope:** `src/http/**` +- **Files:** `src/http/server.js` +- **Priority:** 10 +- **Depends on (contract only):** none +- **Description:** Create an http.createServer bootstrap that wires a router and listens on a configurable port. +- **Acceptance:** Server starts and responds 404 on an unknown route. + +### TASK-A2 — HTTP router + +- **Declared scope:** `src/http/**` +- **Files:** `src/http/router.js` +- **Priority:** 8 +- **Depends on (contract only):** none +- **Description:** Method+path router that dispatches to handlers. 
+- **Acceptance:** Routes GET/POST/PUT/DELETE to the right handler; unknown → 404. + +### TASK-A3 — HTTP handlers + +- **Declared scope:** `src/http/**` +- **Files:** `src/http/handlers.js` +- **Priority:** 5 +- **Depends on (contract only):** TASK-B1 +- **Description:** CRUD handlers that translate HTTP to store calls and JSON responses. +- **Acceptance:** Each handler returns the correct status code and JSON body. + +### TASK-B1 — Store core + +- **Declared scope:** `src/store/**` +- **Files:** `src/store/store.js` +- **Priority:** 10 +- **Depends on (contract only):** none +- **Description:** In-memory store implementing the documented create/get/list/update/remove contract. +- **Acceptance:** All five operations behave per contract. + +### TASK-B2 — Store validation + +- **Declared scope:** `src/store/**` +- **Files:** `src/store/validate.js` +- **Priority:** 8 +- **Depends on (contract only):** none +- **Description:** Input validation for todo records (title required, done is boolean). +- **Acceptance:** Invalid records are rejected with a clear error. + +### TASK-B3 — Store query helpers + +- **Declared scope:** `src/store/**` +- **Files:** `src/store/query.js` +- **Priority:** 5 +- **Depends on (contract only):** TASK-B1 +- **Description:** Filtering/sorting helpers over the store (by done flag, by created time). +- **Acceptance:** Query helpers return correctly filtered/sorted lists. 
diff --git a/experiments/coordination-layer/harness/trials/build-url-shortener.project.md b/experiments/coordination-layer/harness/trials/build-url-shortener.project.md new file mode 100644 index 0000000..e8d1cec --- /dev/null +++ b/experiments/coordination-layer/harness/trials/build-url-shortener.project.md @@ -0,0 +1,51 @@ +# Project: URL shortener (build) + +**Goal:** Build a small URL-shortener library with three non-overlapping concerns — a base62 codec, a persistence layer, and an HTTP facade — to test path-routing on a different shape than the Todo API (three thin slices instead of two halves). +**Branch:** experiment/coordination-layer +**Kind:** build +**Agents:** gemini, codex +**Path-scoping strategy:** per-concern (codec / store / http) +**Max active claims per agent:** 2 + +## Constraints + +- Standard library only. +- Each concern lives entirely under its own directory; no cross-directory edits. + +## Sub-tasks + +### TASK-CODEC1 — base62 encode/decode + +- **Declared scope:** `src/codec/**` +- **Files:** `src/codec/base62.js` +- **Priority:** 10 +- **Depends on (contract only):** none +- **Description:** Encode an integer id to a base62 short code and decode it back. +- **Acceptance:** decode(encode(n)) === n for a range of n. + +### TASK-STORE1 — link store + +- **Declared scope:** `src/store/**` +- **Files:** `src/store/links.js` +- **Priority:** 9 +- **Depends on (contract only):** none +- **Description:** In-memory map from short code to long URL with put/get. +- **Acceptance:** put then get round-trips a URL; missing code returns null. + +### TASK-HTTP1 — shorten endpoint + +- **Declared scope:** `src/http/**` +- **Files:** `src/http/shorten.js` +- **Priority:** 7 +- **Depends on (contract only):** TASK-CODEC1, TASK-STORE1 +- **Description:** POST handler that stores a URL and returns its short code. +- **Acceptance:** Returns 201 with a code that resolves back to the URL.
+ +### TASK-HTTP2 — redirect endpoint + +- **Declared scope:** `src/http/**` +- **Files:** `src/http/redirect.js` +- **Priority:** 6 +- **Depends on (contract only):** TASK-STORE1 +- **Description:** GET handler that 302-redirects a short code to its long URL. +- **Acceptance:** Known code → 302 with Location; unknown code → 404. diff --git a/experiments/coordination-layer/harness/trials/debug-calc-bugs.project.md b/experiments/coordination-layer/harness/trials/debug-calc-bugs.project.md new file mode 100644 index 0000000..8eba7ed --- /dev/null +++ b/experiments/coordination-layer/harness/trials/debug-calc-bugs.project.md @@ -0,0 +1,39 @@ +# Project: Calc library bug-fix (debug) + +**Goal:** A tiny calc library ships two seeded bugs (add subtracts, mul adds). Each bug is an independent, path-isolated debug task with a failing test that must go green. Tests the coordination layer on a debug/fix-iterate workload rather than greenfield build. +**Branch:** experiment/coordination-layer +**Kind:** debug +**Agents:** gemini, codex +**Fixture:** fixtures/calc-bugs +**Path-scoping strategy:** per-file +**Max active claims per agent:** 2 +**Verify:** node --test test/*.test.js + +## Constraints + +- Fix only the implementation under `src/`; do not weaken the tests. +- Each task owns exactly one source file — no cross-file edits. + +## Sub-tasks + +### TASK-ADD — fix add() + +- **Declared scope:** `src/add.js, test/add.test.js` +- **Files:** `src/add.js` +- **Priority:** 10 +- **Depends on (contract only):** none +- **Description:** `src/add.js` subtracts instead of adding. Make `test/add.test.js` pass. +- **Acceptance:** `node --test test/add.test.js` exits 0. 
+- **Verify:** node --test test/add.test.js +- **Mock-solution:** src/add.js<=.solutions/add.js + +### TASK-MUL — fix mul() + +- **Declared scope:** `src/mul.js, test/mul.test.js` +- **Files:** `src/mul.js` +- **Priority:** 8 +- **Depends on (contract only):** none +- **Description:** `src/mul.js` adds instead of multiplying. Make `test/mul.test.js` pass. +- **Acceptance:** `node --test test/mul.test.js` exits 0. +- **Verify:** node --test test/mul.test.js +- **Mock-solution:** src/mul.js<=.solutions/mul.js diff --git a/experiments/coordination-layer/harness/trials/debug-poisoned-task.project.md b/experiments/coordination-layer/harness/trials/debug-poisoned-task.project.md new file mode 100644 index 0000000..d4f03bb --- /dev/null +++ b/experiments/coordination-layer/harness/trials/debug-poisoned-task.project.md @@ -0,0 +1,38 @@ +# Project: Poisoned-task debug (circuit-breaker) + +**Goal:** One genuinely fixable bug alongside one contradictory, unsatisfiable task. Tests that an agent fixes what it can AND that the poisoned task triggers a bounded circuit-break instead of burning unlimited fix-iterate attempts. +**Branch:** experiment/coordination-layer +**Kind:** debug +**Agents:** gemini, codex +**Fixture:** fixtures/poison +**Path-scoping strategy:** per-file +**Max active claims per agent:** 2 +**Verify:** node --test test/*.test.js + +## Constraints + +- The poisoned task has no valid solution by construction — do not weaken tests + or fake success to "pass" it. Circuit-break it after a bounded number of tries. + +## Sub-tasks + +### TASK-FIX — fix parse() + +- **Declared scope:** `src/parse.js, test/parse.test.js` +- **Files:** `src/parse.js` +- **Priority:** 10 +- **Depends on (contract only):** none +- **Description:** `src/parse.js` splits on `:` but the contract is `key=value`. Make `test/parse.test.js` pass. +- **Acceptance:** `node --test test/parse.test.js` exits 0. 
+- **Verify:** node --test test/parse.test.js +- **Mock-solution:** src/parse.js<=.solutions/parse.js + +### TASK-POISON — satisfy contradictory spec + +- **Declared scope:** `src/impossible/**` +- **Files:** `src/impossible/thing.js` +- **Priority:** 6 +- **Depends on (contract only):** none +- **Description:** Make `value()` return both 3 and 4 for the same input at the same time. This is logically impossible and exists to exercise the circuit-breaker. +- **Acceptance:** Unsatisfiable by construction — the verify command always fails. +- **Verify:** sh -c 'echo "contradictory requirement: value() cannot return 3 and 4 at once" >&2; exit 1' diff --git a/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/add.js b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/add.js new file mode 100644 index 0000000..4e81f85 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/add.js @@ -0,0 +1,3 @@ +'use strict'; +// Reference fix applied by the mock driver (a real agent debugs to this itself). +module.exports = (a, b) => a + b; diff --git a/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/mul.js b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/mul.js new file mode 100644 index 0000000..365097c --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/.solutions/mul.js @@ -0,0 +1,3 @@ +'use strict'; +// Reference fix applied by the mock driver (a real agent debugs to this itself). 
+module.exports = (a, b) => a * b; diff --git a/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/package.json b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/package.json new file mode 100644 index 0000000..f8bde1c --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/package.json @@ -0,0 +1,9 @@ +{ + "name": "calc-bugs-fixture", + "version": "0.0.0", + "private": true, + "description": "Seeded-bug fixture for the debug-calc-bugs trial. src/ contains deliberately wrong implementations; test/ encodes the correct behavior; .solutions/ holds the fix the mock driver applies.", + "scripts": { + "test": "node --test" + } +} diff --git a/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/add.js b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/add.js new file mode 100644 index 0000000..06192f8 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/add.js @@ -0,0 +1,4 @@ +'use strict'; +// SEEDED BUG: subtracts instead of adding. The agent's task is to fix this so +// test/add.test.js passes. +module.exports = (a, b) => a - b; diff --git a/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/mul.js b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/mul.js new file mode 100644 index 0000000..b57cf71 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/src/mul.js @@ -0,0 +1,4 @@ +'use strict'; +// SEEDED BUG: adds instead of multiplying. The agent's task is to fix this so +// test/mul.test.js passes. 
+module.exports = (a, b) => a + b; diff --git a/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/add.test.js b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/add.test.js new file mode 100644 index 0000000..890c91d --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/add.test.js @@ -0,0 +1,9 @@ +'use strict'; +const test = require('node:test'); +const assert = require('node:assert'); +const add = require('../src/add'); + +test('add sums its arguments', () => { + assert.strictEqual(add(2, 3), 5); + assert.strictEqual(add(-1, 1), 0); +}); diff --git a/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/mul.test.js b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/mul.test.js new file mode 100644 index 0000000..54bf7b7 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/calc-bugs/test/mul.test.js @@ -0,0 +1,9 @@ +'use strict'; +const test = require('node:test'); +const assert = require('node:assert'); +const mul = require('../src/mul'); + +test('mul multiplies its arguments', () => { + assert.strictEqual(mul(2, 3), 6); + assert.strictEqual(mul(4, 0), 0); +}); diff --git a/experiments/coordination-layer/harness/trials/fixtures/poison/.solutions/parse.js b/experiments/coordination-layer/harness/trials/fixtures/poison/.solutions/parse.js new file mode 100644 index 0000000..890cffc --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/poison/.solutions/parse.js @@ -0,0 +1,6 @@ +'use strict'; +// Reference fix applied by the mock driver (a real agent debugs to this itself). 
+module.exports = (s) => { + const [k, v] = String(s).split('='); + return { [k]: v }; +}; diff --git a/experiments/coordination-layer/harness/trials/fixtures/poison/package.json b/experiments/coordination-layer/harness/trials/fixtures/poison/package.json new file mode 100644 index 0000000..f976237 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/poison/package.json @@ -0,0 +1,7 @@ +{ + "name": "poison-fixture", + "version": "0.0.0", + "private": true, + "description": "Fixture for the debug-poisoned-task trial: one genuinely fixable bug plus one contradictory/unsatisfiable task that should trigger a circuit-break rather than infinite retries.", + "scripts": { "test": "node --test test/" } +} diff --git a/experiments/coordination-layer/harness/trials/fixtures/poison/src/parse.js b/experiments/coordination-layer/harness/trials/fixtures/poison/src/parse.js new file mode 100644 index 0000000..ba73602 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/poison/src/parse.js @@ -0,0 +1,6 @@ +'use strict'; +// SEEDED BUG: splits on ':' but the contract (and tests) use 'key=value'. 
+module.exports = (s) => { + const [k, v] = String(s).split(':'); + return { [k]: v }; +}; diff --git a/experiments/coordination-layer/harness/trials/fixtures/poison/test/parse.test.js b/experiments/coordination-layer/harness/trials/fixtures/poison/test/parse.test.js new file mode 100644 index 0000000..301b317 --- /dev/null +++ b/experiments/coordination-layer/harness/trials/fixtures/poison/test/parse.test.js @@ -0,0 +1,9 @@ +'use strict'; +const test = require('node:test'); +const assert = require('node:assert'); +const parse = require('../src/parse'); + +test('parse reads key=value pairs', () => { + assert.deepStrictEqual(parse('a=1'), { a: '1' }); + assert.deepStrictEqual(parse('name=trinity'), { name: 'trinity' }); +}); From 9ecad7854d8e1153b5d6079cfa33ec8df0a349f4 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 8 Jun 2026 05:30:50 +0000 Subject: [PATCH 2/3] Trinity: tick MCP server + CLI-orchestration confirmation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mcp/tick-mcp.js: zero-dependency MCP stdio server exposing all 13 tick coordination verbs as typed tools, a drop-in alternative to the CLI (same src/ engine). Wiring docs + example .mcp.json + mcp-smoke.js (8/8). - harness --transport cli|mcp: agents coordinate via ./tick or the MCP tools; MCP mode auto-writes a workspace .mcp.json bound to the run's isolated state, and uses an MCP-flavored agent prompt. - harness/test/confirm-cli-orchestration.sh + test/fake-cli/: confirm the harness executes and monitors the real gemini/codex command shapes (gemini --yolo, codex exec --full-auto -) — prompt on stdin, identity from prompt, coordinate via tick, monitored to clean exit, path-routed with no collisions (7/7). 
https://claude.ai/code/session_01WnzAdCRGrrhukvW1etFLyB --- CHANGELOG.md | 7 + .../coordination-layer/harness/README.md | 26 ++ .../coordination-layer/harness/bin/trial | 2 + .../harness/prompts/agent-loop-mcp.md | 55 +++ .../coordination-layer/harness/src/prompts.js | 5 +- .../coordination-layer/harness/src/run.js | 16 +- .../harness/test/confirm-cli-orchestration.sh | 49 +++ .../harness/test/fake-cli/agent-sim.js | 82 +++++ .../harness/test/fake-cli/codex | 3 + .../harness/test/fake-cli/gemini | 3 + experiments/coordination-layer/mcp/README.md | 68 ++++ .../coordination-layer/mcp/test/mcp-smoke.js | 107 ++++++ .../coordination-layer/mcp/tick-mcp.js | 330 ++++++++++++++++++ .../mcp/tick.mcp.example.json | 11 + 14 files changed, 760 insertions(+), 4 deletions(-) create mode 100644 experiments/coordination-layer/harness/prompts/agent-loop-mcp.md create mode 100755 experiments/coordination-layer/harness/test/confirm-cli-orchestration.sh create mode 100755 experiments/coordination-layer/harness/test/fake-cli/agent-sim.js create mode 100755 experiments/coordination-layer/harness/test/fake-cli/codex create mode 100755 experiments/coordination-layer/harness/test/fake-cli/gemini create mode 100644 experiments/coordination-layer/mcp/README.md create mode 100755 experiments/coordination-layer/mcp/test/mcp-smoke.js create mode 100755 experiments/coordination-layer/mcp/tick-mcp.js create mode 100644 experiments/coordination-layer/mcp/tick.mcp.example.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e7132e..cdeefc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,13 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Do not edit a version block that has already been committed and pushed --> +## [2.3.0] - 2026-06-08 + +### Added +- **tick MCP server** (`experiments/coordination-layer/mcp/tick-mcp.js`) — exposes all 13 coordination verbs as typed MCP tools (`tick_take`, `tick_done`, `tick_break`, …) as a drop-in alternative to the 
`tick` CLI. Zero-dependency JSON-RPC-over-stdio; every tool calls the same `src/` modules as the CLI, so CLI and MCP are interchangeable fronts on one `.tick/` state. Includes wiring docs, an example `.mcp.json`, and a self-contained `test/mcp-smoke.js` that drives the server over real JSON-RPC (8/8 green). +- **`--transport cli|mcp`** for the trial harness — selects whether agents coordinate via the `./tick` shim or the tick MCP tools (with an MCP-flavored agent prompt and an auto-generated workspace `.mcp.json` bound to the run's isolated state). +- **CLI-orchestration confirmation** (`harness/test/confirm-cli-orchestration.sh` + `test/fake-cli/`) — proves the harness can execute and monitor the real `gemini`/`codex` CLI invocations (`gemini --yolo`, `codex exec --full-auto -`) end-to-end via stand-in binaries that honor the headless contract: prompt on stdin, identity from the prompt, coordinate through `tick`. Asserts real command shapes, stdin delivery, clean monitored exits, and collision-free path-routing (7/7 green). + ## [2.2.0] - 2026-06-08 ### Added diff --git a/experiments/coordination-layer/harness/README.md b/experiments/coordination-layer/harness/README.md index 6223769..5746ae8 100644 --- a/experiments/coordination-layer/harness/README.md +++ b/experiments/coordination-layer/harness/README.md @@ -101,6 +101,32 @@ vars so you can tune them without editing code (CLIs move fast): `--agents gemini:gemini,codex:codex` for a real run, or `--agents a:mock,b:mock` to validate the harness anywhere. +## CLI vs MCP transport + +Agents can coordinate two ways, selectable with `--transport`: + +- `--transport cli` (default) — agents call the `./tick` shim. Simplest for any + agent that can run shell commands. +- `--transport mcp` — agents call the **`tick` MCP server** + ([`../mcp/`](../mcp/README.md)) tools (`tick_take`, `tick_done`, …). 
The run + drops a ready `.mcp.json` into the workspace bound to that run's isolated + state, and agents get the MCP-flavored prompt (`prompts/agent-loop-mcp.md`). + +Both fronts drive the same engine on the same `.tick/` state, so they're +interchangeable and can even be mixed across agents. + +## Confirming Claude Code can execute + monitor the real CLIs + +`test/confirm-cli-orchestration.sh` proves the execute-and-monitor path without +the real binaries: it points the `gemini`/`codex` drivers at stand-in binaries +(`test/fake-cli/`) that honor the exact headless contract (prompt on stdin, +identity from the prompt, coordinate via `./tick`), then runs a real trial with +the `gemini`/`codex` driver names. It asserts the harness spawned the real +command shapes (`gemini --yolo`, `codex exec --full-auto -`), delivered the +prompt on stdin, monitored both processes to a clean exit, and that path-routing +split the work (HTTP half vs store half) with no collisions. Swap the stand-ins +for the real binaries (with keys) and nothing else changes. + ## Running for real (gemini + codex) 1. 
Install both CLIs and set their API keys: diff --git a/experiments/coordination-layer/harness/bin/trial b/experiments/coordination-layer/harness/bin/trial index 05cd7a2..38b67fc 100755 --- a/experiments/coordination-layer/harness/bin/trial +++ b/experiments/coordination-layer/harness/bin/trial @@ -14,6 +14,7 @@ // // Options: // --agents id:driver,id:driver override participants (driver ∈ gemini|codex|claude|mock) +// --transport cli|mcp how agents call tick (cli = ./tick, mcp = tick MCP tools; default cli) // --skip-preflight skip the question round // --auto don't stop at the human gate even if there are questions // --timeout per-agent wall-clock cap (default 600000) @@ -202,6 +203,7 @@ async function cmdRun(specArg, flags) { rec, spec, specPath: resolvedSpecPath, agents, workspace, timeoutMs: Number(flags.timeout) || 600000, maxAttempts: Number(flags['max-attempts']) || 3, + transport: flags.transport === 'mcp' ? 'mcp' : 'cli', }); console.log('\n' + result.summary); diff --git a/experiments/coordination-layer/harness/prompts/agent-loop-mcp.md b/experiments/coordination-layer/harness/prompts/agent-loop-mcp.md new file mode 100644 index 0000000..df19746 --- /dev/null +++ b/experiments/coordination-layer/harness/prompts/agent-loop-mcp.md @@ -0,0 +1,55 @@ +You are **{{AGENT}}**, one of several AI coding agents working the same codebase +**concurrently** under a coordination protocol. Other agents are working right +now. Your job is to complete tasks from a shared backlog **without colliding** +with them. Read this whole message before doing anything. + +## Project + +**{{PROJECT_NAME}}** ({{PROJECT_KIND}} trial) +{{PROJECT_GOAL}} + +Coordination happens through the **`tick` MCP server** — call its tools; do not +edit `.tick/` files directly. (This is the MCP equivalent of the `tick` CLI; the +tools map 1:1 to the CLI verbs.) 
+ +## The backlog + +{{TASK_TABLE}} + +### Task details + +{{TASK_DETAILS}} + +## The protocol — follow this loop exactly + +Repeat until there are no tasks left for you: + +1. **Claim atomically:** call `tick_take` with `{ "agent": "{{AGENT}}" }`. + - `won: TASK-XXX ...` → that task is yours. Continue. + - `(no available task)` → STOP. You are done. + - `claim limit reached ...` → finish a task you already hold first (max + {{MAX_CLAIMS}} active claims). + - If a tool error mentions a lock is held, wait ~1s and retry `tick_take`. + +2. **Confirm scope:** call `tick_info` with `{ "task": "TASK-XXX" }`. **Only edit + files inside the declared paths.** If you need files outside them, call + `tick_scope` with `{ "task": "TASK-XXX", "agent": "{{AGENT}}", "paths": [...] }` + BEFORE touching them. + +3. **Do the work.** Implement the task to meet its Acceptance criteria; run its + Verify command if it has one. + +4. **Finish or break:** + - Success → `tick_done` `{ "task": "TASK-XXX", "agent": "{{AGENT}}", "note": "..." }`. + - Stuck after {{MAX_ATTEMPTS}} real attempts → `tick_break` + `{ "task": "TASK-XXX", "agent": "{{AGENT}}", "reason": "..." }` and move on. + +5. Go back to step 1. + +## Rules + +- **Never edit a file outside your current claim's declared paths.** `tick_scope` + first if in doubt. +- Always pass `"agent": "{{AGENT}}"` — that is your identity. +- Project verify when done: `{{PROJECT_VERIFY}}` +- Exit cleanly when `tick_take` reports no available task. 
diff --git a/experiments/coordination-layer/harness/src/prompts.js b/experiments/coordination-layer/harness/src/prompts.js index 061bbf5..04ecfb3 100644 --- a/experiments/coordination-layer/harness/src/prompts.js +++ b/experiments/coordination-layer/harness/src/prompts.js @@ -41,8 +41,9 @@ function taskDetails(tasks) { }).join('\n\n'); } -function buildAgentPrompt({ agent, project, tasks, tickCmd, workdir, maxAttempts }) { - const template = fs.readFileSync(path.join(PROMPT_DIR, 'agent-loop.md'), 'utf8'); +function buildAgentPrompt({ agent, project, tasks, tickCmd, workdir, maxAttempts, transport }) { + const file = transport === 'mcp' ? 'agent-loop-mcp.md' : 'agent-loop.md'; + const template = fs.readFileSync(path.join(PROMPT_DIR, file), 'utf8'); return render(template, { AGENT: agent, PROJECT_NAME: project.name, diff --git a/experiments/coordination-layer/harness/src/run.js b/experiments/coordination-layer/harness/src/run.js index 3d66347..a2cd2da 100644 --- a/experiments/coordination-layer/harness/src/run.js +++ b/experiments/coordination-layer/harness/src/run.js @@ -86,12 +86,23 @@ function prepareWorkspace({ rec, spec, specDir }) { return workspace; } -async function runTrial({ rec, spec, specPath, agents, workspace, timeoutMs, maxAttempts }) { +async function runTrial({ rec, spec, specPath, agents, workspace, timeoutMs, maxAttempts, transport }) { rec.event('trial.start', { - project: spec.project.name, kind: spec.project.kind, + project: spec.project.name, kind: spec.project.kind, transport: transport || 'cli', tasks: spec.tasks.length, agents: agents.map(a => `${a.id}:${a.driver}`).join(','), }); + // In MCP mode, drop a ready-to-use .mcp.json in the workspace so an MCP-client + // agent (Claude Code / Gemini / Codex with MCP enabled) auto-discovers the + // tick server bound to this run's isolated state. 
+ if (transport === 'mcp') { + const serverPath = path.join(__dirname, '..', '..', 'mcp', 'tick-mcp.js'); + fs.writeFileSync(path.join(workspace, '.mcp.json'), JSON.stringify({ + mcpServers: { tick: { command: process.execPath, args: [serverPath], env: { TICK_REPO_ROOT: workspace } } }, + }, null, 2) + '\n'); + rec.event('workspace.mcp_config', { server: path.relative(process.cwd(), serverPath) }); + } + // Spawn every agent concurrently. Each runs the integration prompt in its // driver's headless mode; the mock driver ignores the prose and reads the // spec JSON, but every driver gets the same env + cwd. @@ -104,6 +115,7 @@ async function runTrial({ rec, spec, specPath, agents, workspace, timeoutMs, max tickCmd: './tick', workdir: workspace, maxAttempts, + transport, }); fs.writeFileSync(path.join(rec.runDir, `prompt-${agentSpec.id}.md`), prompt); diff --git a/experiments/coordination-layer/harness/test/confirm-cli-orchestration.sh b/experiments/coordination-layer/harness/test/confirm-cli-orchestration.sh new file mode 100755 index 0000000..0219f15 --- /dev/null +++ b/experiments/coordination-layer/harness/test/confirm-cli-orchestration.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# CONFIRMATION: Claude Code (or any operator) can execute AND monitor the real +# gemini/codex CLI commands through the harness. +# +# We point the gemini/codex drivers at stand-in binaries (GEMINI_CMD/CODEX_CMD) +# that honor the exact headless contract, then run a real trial with the REAL +# driver names `gemini` and `codex`. The driver builds and spawns the real +# command shapes (`gemini --yolo`, `codex exec --full-auto -`), pipes the prompt +# on stdin, runs both concurrently, and captures every transcript — exactly what +# it will do with the real binaries installed. Only the model is swapped out. +set -euo pipefail + +HARNESS="$(cd "$(dirname "$0")/.." 
&& pwd)" +FAKE="$HARNESS/test/fake-cli" +export GEMINI_CMD="$FAKE/gemini" +export CODEX_CMD="$FAKE/codex" + +PASS=0; FAIL=0 +ok() { printf ' ✓ %s\n' "$1"; PASS=$((PASS+1)); } +bad() { printf ' ✗ %s\n' "$1"; FAIL=$((FAIL+1)); } + +echo "=== doctor sees gemini + codex (via *_CMD overrides) ===" +node "$HARNESS/bin/trial" doctor + +echo; echo "=== running trial with REAL gemini/codex drivers (stand-in binaries) ===" +OUT=$(mktemp) +node "$HARNESS/bin/trial" run build-todo-api --agents gemini:gemini,codex:codex --auto --timeout 60000 | tee "$OUT" +RD="$HARNESS/$(grep -oE 'runs/[^ ]+' "$OUT" | head -1)" + +echo; echo "=== assertions ===" +# The harness spawned the real command shapes and monitored them (run.jsonl is JSON). +grep -q '"cmd":"[^"]*gemini --yolo"' "$RD/run.jsonl" && ok "spawned 'gemini --yolo'" || bad "no gemini --yolo spawn" +grep -q '"cmd":"[^"]*codex exec --full-auto -"' "$RD/run.jsonl" && ok "spawned 'codex exec --full-auto -'" || bad "no codex exec spawn" +# Both processes were monitored to a clean exit. +grep -q '"type":"agent.exit","agent":"gemini","code":0' "$RD/run.jsonl" && ok "monitored gemini → exit 0" || bad "gemini did not exit cleanly" +grep -q '"type":"agent.exit","agent":"codex","code":0' "$RD/run.jsonl" && ok "monitored codex → exit 0" || bad "codex did not exit cleanly" +# The prompt was delivered on stdin (the stand-in echoes its byte count). +grep -q 'prompt [0-9]* bytes on stdin' "$RD/logs/gemini.log" && ok "prompt delivered to gemini on stdin" || bad "no stdin prompt to gemini" +# Work actually completed through the coordination protocol. +grep -q '"done": 6' "$RD/report/analyze.json" && ok "all 6 tasks completed via tick protocol" || bad "tasks not all done" +# Live transcripts exist for monitoring. 
+[ -s "$RD/logs/gemini.log" ] && [ -s "$RD/logs/codex.log" ] && ok "per-agent transcripts captured" || bad "missing transcripts" + +echo; echo "--- per-agent transcripts (the 'monitor' view) ---" +echo "# gemini.log"; sed 's/^/ /' "$RD/logs/gemini.log" +echo "# codex.log"; sed 's/^/ /' "$RD/logs/codex.log" + +printf '\n--- confirm-cli-orchestration: %d passed, %d failed ---\n' "$PASS" "$FAIL" +[ "$FAIL" -eq 0 ] diff --git a/experiments/coordination-layer/harness/test/fake-cli/agent-sim.js b/experiments/coordination-layer/harness/test/fake-cli/agent-sim.js new file mode 100755 index 0000000..6a51041 --- /dev/null +++ b/experiments/coordination-layer/harness/test/fake-cli/agent-sim.js @@ -0,0 +1,82 @@ +#!/usr/bin/env node +'use strict'; + +// Stand-in for a real Gemini/Codex CLI, used to CONFIRM the harness can execute +// and monitor those CLIs end-to-end without the real binaries installed. +// +// It honors the real headless contract exactly: +// - the prompt arrives on STDIN (the harness pipes it), +// - driver flags (--yolo, exec, --full-auto, -) arrive as argv and are ignored, +// - it learns its own identity from the prompt ("You are **<agent>**"), +// - it coordinates only through the `./tick` shim in the cwd. +// +// The ONLY thing this lacks vs. a real agent is the model writing real code; it +// creates a placeholder file in each task's declared scope so the protocol and +// the harness's spawn/monitor/observe path are exercised for real. + +const fs = require('fs'); +const path = require('path'); +const { execFileSync } = require('child_process'); + +function readStdin() { + try { return fs.readFileSync(0, 'utf8'); } catch { return ''; } +} + +function sleep(ms) { const end = Date.now() + ms; while (Date.now() < end) {} } + +const prompt = readStdin(); +const idMatch = prompt.match(/You are \*\*([A-Za-z0-9_-]+)\*\*/); +const agent = idMatch ? 
idMatch[1] : (process.env.TICK_AGENT || 'unknown'); + +// Print to stdout so the harness transcript (logs/<agent>.log) captures progress +// — this is the "monitor" half of execute-and-monitor. +function say(m) { process.stdout.write(`[${agent}] ${m}\n`); } + +// Preflight prompts are read-only: just answer and exit. (The word "preflight" +// appears only in the preflight prompt, never in the run prompt.) +if (/preflight/i.test(prompt)) { + say('NO_QUESTIONS'); + process.exit(0); +} + +function tick(args, { retries = 8 } = {}) { + for (let attempt = 0; ; attempt++) { + try { + return execFileSync('./tick', args, { cwd: process.cwd(), encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] }).trim(); + } catch (err) { + const msg = (err.stderr || '') + (err.message || ''); + if (/lock held|claim is in progress/i.test(msg) && attempt < retries) { sleep(60 + attempt * 40); continue; } + throw err; + } + } +} + +say(`started (driver argv: ${process.argv.slice(2).join(' ') || 'none'}; prompt ${prompt.length} bytes on stdin)`); + +let completed = 0; +for (let guard = 0; guard < 100; guard++) { + const out = tick(['take', '--agent', agent]); + if (out.startsWith('(no available task)')) { say('no available task — standing down'); break; } + if (out.includes('claim limit reached')) { say('at claim cap — wrapping up'); break; } + const m = out.match(/^won:\s+(TASK-[A-Za-z0-9_-]+)/); + if (!m) { say(`unexpected take output: ${out}`); break; } + const task = m[1]; + + // Discover scope via the protocol (no copy-paste from the prompt). 
+ const info = tick(['info', task]); + const pathsLine = (info.split('\n').find(l => l.startsWith('paths:')) || 'paths:').replace('paths:', '').trim(); + const firstGlob = (pathsLine.split(',')[0] || '').trim(); + const rel = firstGlob.replace(/\/\*\*.*$/, '/AGENT_OUTPUT.txt').replace(/\*+/g, 'x') || `${task}.txt`; + + say(`claimed ${task} (scope: ${pathsLine}) — writing ${rel}`); + sleep(200 + (task.charCodeAt(task.length - 1) % 5) * 90); // create real overlap + const abs = path.join(process.cwd(), rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, `// ${task} produced by ${agent}\n`); + + tick(['done', task, '--agent', agent, '--note', 'fake-cli run']); + say(`done ${task}`); + completed++; +} +say(`finished — completed ${completed} task(s)`); +process.exit(0); diff --git a/experiments/coordination-layer/harness/test/fake-cli/codex b/experiments/coordination-layer/harness/test/fake-cli/codex new file mode 100755 index 0000000..f21d0c6 --- /dev/null +++ b/experiments/coordination-layer/harness/test/fake-cli/codex @@ -0,0 +1,3 @@ +#!/bin/sh +# Stand-in for the real codex CLI (see agent-sim.js). +exec node "$(dirname "$0")/agent-sim.js" "$@" diff --git a/experiments/coordination-layer/harness/test/fake-cli/gemini b/experiments/coordination-layer/harness/test/fake-cli/gemini new file mode 100755 index 0000000..d49314d --- /dev/null +++ b/experiments/coordination-layer/harness/test/fake-cli/gemini @@ -0,0 +1,3 @@ +#!/bin/sh +# Stand-in for the real gemini CLI (see agent-sim.js). 
+exec node "$(dirname "$0")/agent-sim.js" "$@" diff --git a/experiments/coordination-layer/mcp/README.md b/experiments/coordination-layer/mcp/README.md new file mode 100644 index 0000000..2baca71 --- /dev/null +++ b/experiments/coordination-layer/mcp/README.md @@ -0,0 +1,68 @@ +# tick MCP server — coordination as MCP tools (alternative to the CLI) + +`tick-mcp.js` exposes the Trinity coordination verbs as typed MCP tools, so an +agent (or Claude Code) can coordinate over MCP instead of shelling out to +`../bin/tick`. It's a thin adapter — every tool calls the same `../src/` modules +the CLI does, so behaviour is identical. **CLI and MCP are interchangeable +fronts on one engine; mix freely on the same `.tick/` state.** + +Zero dependencies: a minimal JSON-RPC 2.0 over newline-delimited stdio (the MCP +stdio transport). Implements `initialize`, `tools/list`, `tools/call`, `ping`. + +## Tools + +| Tool | CLI equivalent | Notes | +|---|---|---| +| `tick_init` | `tick init` | create `.tick/events` | +| `tick_log` | `tick log ` | append a raw event | +| `tick_project` | `tick project` | returns STATE.md contents | +| `tick_take` | `tick take` | atomic next+claim (recommended) | +| `tick_next` | `tick next` | read-only peek | +| `tick_claim` | `tick claim` | claim a specific task | +| `tick_scope` | `tick scope` | change a claim's paths | +| `tick_release` | `tick release` | release / handoff (`to`) | +| `tick_break` | `tick break` | circuit-break | +| `tick_done` | `tick done` | complete | +| `tick_reap` | `tick reap` | coordinator liveness recovery | +| `tick_info` | `tick info` | task status/scope | +| `tick_analyze` | `tick analyze` | `format`: human \| md \| json | + +Every tool accepts an optional `repo_root`. If omitted it falls back to +`TICK_REPO_ROOT`, then the enclosing git toplevel — same resolution as the CLI. +`paths` accepts an array **or** a comma string. 
+ +## Wiring into Claude Code (or any MCP client) + +Add to `.mcp.json` (repo root) — see `tick.mcp.example.json` here: + +```json +{ + "mcpServers": { + "tick": { + "command": "node", + "args": ["experiments/coordination-layer/mcp/tick-mcp.js"], + "env": { "TICK_REPO_ROOT": "/abs/path/to/the/coordination/workspace" } + } + } +} +``` + +Then the verbs surface as `tick_*` tools. For **Gemini CLI** and **Codex CLI**, +add the same server block to their MCP config (`~/.gemini/settings.json` / +the Codex MCP config) so peer agents coordinate over MCP too. + +## When to use MCP vs CLI + +- **CLI** (`bin/tick`, the harness default): simplest for headless agents that + already run shell commands; no client wiring needed. This is what + `harness/prompts/agent-loop.md` tells agents to use. +- **MCP**: better when the agent is an MCP client (Claude Code, or Gemini/Codex + with MCP enabled) and you want typed tool calls, argument validation, and the + coordination verbs to appear alongside the agent's other tools. Use the + MCP-flavored prompt at `harness/prompts/agent-loop-mcp.md`. + +## Test + +```bash +node test/mcp-smoke.js # spawns the server, drives it over JSON-RPC, asserts +``` diff --git a/experiments/coordination-layer/mcp/test/mcp-smoke.js b/experiments/coordination-layer/mcp/test/mcp-smoke.js new file mode 100755 index 0000000..fa0a9f9 --- /dev/null +++ b/experiments/coordination-layer/mcp/test/mcp-smoke.js @@ -0,0 +1,107 @@ +#!/usr/bin/env node +'use strict'; + +// Smoke test for tick-mcp: spawns the server and drives it over real JSON-RPC +// stdio (initialize → tools/list → tools/call ...) against a throwaway repo +// root, asserting the coordination protocol works through the MCP surface. +// Zero dependencies, runnable anywhere. 
+ +const { spawn } = require('child_process'); +const fs = require('fs'); +const os = require('os'); +const path = require('path'); +const readline = require('readline'); +const assert = require('assert'); + +const SERVER = path.join(__dirname, '..', 'tick-mcp.js'); + +function makeClient(env) { + const child = spawn(process.execPath, [SERVER], { env, stdio: ['pipe', 'pipe', 'inherit'] }); + const rl = readline.createInterface({ input: child.stdout }); + const pending = new Map(); + rl.on('line', (line) => { + if (!line.trim()) return; + const msg = JSON.parse(line); + if (msg.id != null && pending.has(msg.id)) { pending.get(msg.id)(msg); pending.delete(msg.id); } + }); + let nextId = 1; + function call(method, params) { + const id = nextId++; + return new Promise((resolve) => { + pending.set(id, resolve); + child.stdin.write(JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'); + }); + } + function notify(method, params) { + child.stdin.write(JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n'); + } + return { call, notify, close: () => child.kill() }; +} + +function textOf(res) { + assert(res.result, `expected result, got ${JSON.stringify(res)}`); + return res.result.content.map(c => c.text).join(''); +} + +async function main() { + const root = fs.mkdtempSync(path.join(os.tmpdir(), 'tick-mcp-')); + const env = { ...process.env, TICK_REPO_ROOT: root }; + const c = makeClient(env); + let pass = 0; const ok = (m) => { console.log(` ✓ ${m}`); pass++; }; + + // Handshake + const init = await c.call('initialize', { protocolVersion: '2024-11-05', capabilities: {} }); + assert.strictEqual(init.result.serverInfo.name, 'tick-mcp'); + ok('initialize → serverInfo tick-mcp'); + c.notify('notifications/initialized'); + + // tools/list + const list = await c.call('tools/list', {}); + const names = list.result.tools.map(t => t.name); + for (const v of ['tick_init', 'tick_take', 'tick_done', 'tick_analyze', 'tick_break']) { + assert(names.includes(v), `missing 
tool ${v}`); + } + ok(`tools/list → ${names.length} tools incl. all verbs`); + + // init + seed two non-overlapping tasks + await c.call('tools/call', { name: 'tick_init', arguments: {} }); + await c.call('tools/call', { name: 'tick_log', arguments: { type: 'task.created', task: 'TASK-A', agent: 'dispatcher', priority: 10, paths: 'src/a/**' } }); + await c.call('tools/call', { name: 'tick_log', arguments: { type: 'task.created', task: 'TASK-B', agent: 'dispatcher', priority: 8, paths: ['src/b/**'] } }); + ok('init + seeded TASK-A, TASK-B (array + string paths both accepted)'); + + // two agents take concurrently → each gets a different task + const ta = textOf(await c.call('tools/call', { name: 'tick_take', arguments: { agent: 'gemini' } })); + const tb = textOf(await c.call('tools/call', { name: 'tick_take', arguments: { agent: 'codex' } })); + assert(/won: TASK-A/.test(ta), `gemini take: ${ta}`); + assert(/won: TASK-B/.test(tb), `codex take: ${tb}`); + ok('tick_take routed gemini→TASK-A, codex→TASK-B (no overlap)'); + + // ownership enforcement: codex cannot done gemini's task + const bad = await c.call('tools/call', { name: 'tick_done', arguments: { task: 'TASK-A', agent: 'codex' } }); + assert(bad.result.isError, 'expected ownership error'); + ok('ownership enforced: codex cannot complete gemini\'s task (isError)'); + + // proper completion + a circuit break + await c.call('tools/call', { name: 'tick_done', arguments: { task: 'TASK-A', agent: 'gemini', note: 'via mcp' } }); + await c.call('tools/call', { name: 'tick_break', arguments: { task: 'TASK-B', agent: 'codex', reason: 'stuck' } }); + ok('tick_done + tick_break succeeded'); + + // info + analyze + const info = textOf(await c.call('tools/call', { name: 'tick_info', arguments: { task: 'TASK-B' } })); + assert(/circuit_broken/.test(info), `info: ${info}`); + const report = JSON.parse(textOf(await c.call('tools/call', { name: 'tick_analyze', arguments: { format: 'json' } }))); + 
assert.strictEqual(report.event_counts.done, 1, 'expected 1 done'); + assert.strictEqual(report.event_counts.circuit_break, 1, 'expected 1 break'); + ok(`tick_analyze → done:1 break:1; tick_info shows circuit_broken`); + + // ping + const ping = await c.call('ping', {}); + assert.deepStrictEqual(ping.result, {}); + ok('ping → {}'); + + c.close(); + fs.rmSync(root, { recursive: true, force: true }); + console.log(`\n--- mcp-smoke: ${pass} passed ---`); +} + +main().catch((e) => { console.error('FAIL:', e.message); process.exit(1); }); diff --git a/experiments/coordination-layer/mcp/tick-mcp.js b/experiments/coordination-layer/mcp/tick-mcp.js new file mode 100755 index 0000000..602a2f6 --- /dev/null +++ b/experiments/coordination-layer/mcp/tick-mcp.js @@ -0,0 +1,330 @@ +#!/usr/bin/env node +'use strict'; + +// tick-mcp — an MCP server exposing the Trinity coordination verbs as typed +// tools, so an agent (or Claude Code) can coordinate via MCP instead of shelling +// out to `bin/tick`. Behaviour is identical to the CLI because both call the +// same `src/` modules — this is a thin protocol adapter, not a reimplementation. +// +// Zero dependencies on purpose (the spike's "no new deps" rule): a minimal +// JSON-RPC 2.0 over newline-delimited stdio, which is the MCP stdio transport. +// Implements initialize / tools/list / tools/call / ping. 
+// +// Wiring (Claude Code / any MCP client), in .mcp.json: +// { "mcpServers": { "tick": { +// "command": "node", +// "args": ["experiments/coordination-layer/mcp/tick-mcp.js"], +// "env": { "TICK_REPO_ROOT": "/abs/path/to/coordination/workspace" } } } } + +const path = require('path'); +const readline = require('readline'); +const { execFileSync } = require('child_process'); + +const { appendEvent, ensureEventsDir, EVENT_TYPES } = require('../src/events'); +const { project } = require('../src/project'); +const { claim } = require('../src/claim'); +const { take } = require('../src/take'); +const { next } = require('../src/next'); +const { scope, release, circuitBreak, done, reap } = require('../src/scope'); +const { analyze, renderHuman, renderMd } = require('../src/analyze'); + +const SERVER = { name: 'tick-mcp', version: '0.1.0' }; +const DEFAULT_PROTOCOL = '2024-11-05'; + +// Resolve the coordination repo root the same way bin/tick does: explicit arg, +// then TICK_REPO_ROOT, then the enclosing git toplevel, then cwd. 
+function resolveRoot(args) { + if (args && args.repo_root) return path.resolve(args.repo_root); + if (process.env.TICK_REPO_ROOT) return path.resolve(process.env.TICK_REPO_ROOT); + try { + return execFileSync('git', ['rev-parse', '--show-toplevel'], { encoding: 'utf8' }).trim(); + } catch { + return process.cwd(); + } +} + +function asPaths(v) { + if (Array.isArray(v)) return v.map(String).map(s => s.trim()).filter(Boolean); + if (typeof v === 'string') return v.split(',').map(s => s.trim()).filter(Boolean); + return undefined; +} + +// --- tool registry: name -> { description, inputSchema, run(args)->string } --- +const TOOLS = { + tick_init: { + description: 'Create the .tick/events coordination directory (idempotent).', + inputSchema: { type: 'object', properties: { repo_root: { type: 'string' } } }, + run(args) { + const root = resolveRoot(args); + ensureEventsDir(root); + return `initialized .tick/events at ${root}`; + }, + }, + + tick_log: { + description: 'Append a raw coordination event. 
type is one of the seven event types.', + inputSchema: { + type: 'object', + required: ['type', 'task'], + properties: { + type: { type: 'string', enum: Array.from(EVENT_TYPES) }, + task: { type: 'string' }, + agent: { type: 'string' }, + note: { type: 'string' }, + paths: { type: ['array', 'string'], items: { type: 'string' } }, + to_agent: { type: 'string' }, + reason: { type: 'string' }, + priority: { type: 'number' }, + repo_root: { type: 'string' }, + }, + }, + run(args) { + const root = resolveRoot(args); + const { path: p } = appendEvent(root, { + type: args.type, + task: args.task, + agent: args.agent || 'unknown', + note: args.note, + paths: asPaths(args.paths), + to_agent: args.to_agent, + reason: args.reason, + priority: args.priority, + }); + return path.relative(root, p); + }, + }, + + tick_project: { + description: 'Rebuild STATE.md from the event log and return its contents.', + inputSchema: { type: 'object', properties: { repo_root: { type: 'string' } } }, + run(args) { + const root = resolveRoot(args); + const { stateFile } = project(root); + return require('fs').readFileSync(stateFile, 'utf8'); + }, + }, + + tick_take: { + description: 'Atomically claim the next available, non-overlapping task for an agent (next+claim under one lock).', + inputSchema: { + type: 'object', required: ['agent'], + properties: { agent: { type: 'string' }, repo_root: { type: 'string' } }, + }, + run(args) { + const root = resolveRoot(args); + const tr = take(root, { agent: args.agent }); + if (tr.limitReached) return `claim limit reached — holding ${tr.holding.join(', ')}`; + if (!tr.won) return '(no available task)'; + return `won: ${tr.task} (priority: ${tr.priority})${tr.handoff ? 
' [handoff]' : ''}`; + }, + }, + + tick_next: { + description: 'Read-only: return the next available task for an agent without claiming it.', + inputSchema: { + type: 'object', required: ['agent'], + properties: { agent: { type: 'string' }, repo_root: { type: 'string' } }, + }, + run(args) { + const root = resolveRoot(args); + const t = next(root, { agent: args.agent }); + if (t && t.limitReached) return `claim limit reached — holding ${t.holding.join(', ')}`; + if (!t) return '(no available task)'; + return `${t.id} (priority: ${t.priority})${t.handoff_to === args.agent ? ' [handoff]' : ''}`; + }, + }, + + tick_claim: { + description: 'Claim a specific task with declared path globs.', + inputSchema: { + type: 'object', required: ['task', 'agent', 'paths'], + properties: { + task: { type: 'string' }, agent: { type: 'string' }, + paths: { type: ['array', 'string'], items: { type: 'string' } }, + repo_root: { type: 'string' }, + }, + }, + run(args) { + const root = resolveRoot(args); + const r = claim(root, { task: args.task, agent: args.agent, paths: asPaths(args.paths) }); + if (r.limitReached) return `lost: claim limit reached (holding ${r.holding.join(', ')})`; + if (r.won) return `won: ${args.task} claimed by ${args.agent}`; + if (r.unavailable) return `lost: ${args.task} is ${r.unavailable}`; + return `lost: ${args.task} already claimed by ${r.winner || 'unknown'}`; + }, + }, + + tick_scope: { + description: 'Change the declared paths of an agent\'s active claim (expand or narrow).', + inputSchema: { + type: 'object', required: ['task', 'agent', 'paths'], + properties: { + task: { type: 'string' }, agent: { type: 'string' }, + paths: { type: ['array', 'string'], items: { type: 'string' } }, + repo_root: { type: 'string' }, + }, + }, + run(args) { + const root = resolveRoot(args); + scope(root, { task: args.task, agent: args.agent, paths: asPaths(args.paths) }); + return `scoped: ${args.task}`; + }, + }, + + tick_release: { + description: 'Release a claim, 
optionally handing it off to a named agent.', + inputSchema: { + type: 'object', required: ['task', 'agent'], + properties: { + task: { type: 'string' }, agent: { type: 'string' }, + to: { type: 'string' }, repo_root: { type: 'string' }, + }, + }, + run(args) { + const root = resolveRoot(args); + release(root, { task: args.task, agent: args.agent, to_agent: args.to }); + return `released: ${args.task}`; + }, + }, + + tick_break: { + description: 'Circuit-break a task so no agent is routed to it (use after bounded failed attempts).', + inputSchema: { + type: 'object', required: ['task', 'agent'], + properties: { + task: { type: 'string' }, agent: { type: 'string' }, + reason: { type: 'string' }, repo_root: { type: 'string' }, + }, + }, + run(args) { + const root = resolveRoot(args); + circuitBreak(root, { task: args.task, agent: args.agent, reason: args.reason || '' }); + return `broken: ${args.task}`; + }, + }, + + tick_done: { + description: 'Mark an agent\'s claimed task complete.', + inputSchema: { + type: 'object', required: ['task', 'agent'], + properties: { + task: { type: 'string' }, agent: { type: 'string' }, + note: { type: 'string' }, repo_root: { type: 'string' }, + }, + }, + run(args) { + const root = resolveRoot(args); + done(root, { task: args.task, agent: args.agent, note: args.note }); + return `done: ${args.task}`; + }, + }, + + tick_reap: { + description: 'Coordinator-only: release all active claims held by an agent (liveness recovery).', + inputSchema: { + type: 'object', required: ['agent'], + properties: { agent: { type: 'string' }, by: { type: 'string' }, repo_root: { type: 'string' } }, + }, + run(args) { + const root = resolveRoot(args); + const r = reap(root, { agent: args.agent, by: args.by || 'coordinator' }); + return r.reaped.length ? 
`reaped ${r.reaped.length}: ${r.reaped.join(', ')}` : `(no active claims held by ${args.agent})`; + }, + }, + + tick_info: { + description: 'Print status/priority/paths/claimer for a task.', + inputSchema: { + type: 'object', required: ['task'], + properties: { task: { type: 'string' }, repo_root: { type: 'string' } }, + }, + run(args) { + const root = resolveRoot(args); + const { tasks } = project(root); + const t = tasks.get(args.task); + if (!t) return `(task ${args.task} not found)`; + const paths = t.status === 'claimed' ? t.claim.paths : t.paths; + const lines = [`id: ${t.id}`, `status: ${t.status}`, `priority: ${t.priority}`, `paths: ${paths.join(', ') || '(none)'}`]; + if (t.status === 'claimed') lines.push(`claimer: ${t.claim.agent}`); + if (t.status === 'circuit_broken') lines.push(`broken-by: ${t.break.agent} — ${t.break.reason}`); + if (t.handoff_to) lines.push(`handoff-to: ${t.handoff_to}`); + return lines.join('\n'); + }, + }, + + tick_analyze: { + description: 'Analyze the coordination event log. format: human | md | json.', + inputSchema: { + type: 'object', + properties: { format: { type: 'string', enum: ['human', 'md', 'json'] }, repo_root: { type: 'string' } }, + }, + run(args) { + const root = resolveRoot(args); + const report = analyze(root); + if (args.format === 'json') return JSON.stringify(report, null, 2); + if (args.format === 'md') return renderMd(report); + return renderHuman(report); + }, + }, +}; + +// --- JSON-RPC plumbing ------------------------------------------------------ +function send(msg) { + process.stdout.write(JSON.stringify(msg) + '\n'); +} + +function reply(id, result) { send({ jsonrpc: '2.0', id, result }); } +function fail(id, code, message) { send({ jsonrpc: '2.0', id, error: { code, message } }); } + +function handle(msg) { + const { id, method, params } = msg; + // Notifications (no id) get no response. 
+ const isNotification = id === undefined || id === null; + + switch (method) { + case 'initialize': + return reply(id, { + protocolVersion: (params && params.protocolVersion) || DEFAULT_PROTOCOL, + capabilities: { tools: {} }, + serverInfo: SERVER, + }); + case 'notifications/initialized': + case 'initialized': + return; // notification + case 'ping': + return reply(id, {}); + case 'tools/list': + return reply(id, { + tools: Object.entries(TOOLS).map(([name, t]) => ({ + name, description: t.description, inputSchema: t.inputSchema, + })), + }); + case 'tools/call': { + const name = params && params.name; + const tool = TOOLS[name]; + if (!tool) return fail(id, -32602, `unknown tool: ${name}`); + try { + const text = tool.run((params && params.arguments) || {}); + return reply(id, { content: [{ type: 'text', text: String(text) }] }); + } catch (err) { + // Tool-level errors are reported via isError, not JSON-RPC error, so the + // model sees the message and can react (per MCP guidance). + return reply(id, { content: [{ type: 'text', text: `error: ${err.message}` }], isError: true }); + } + } + default: + if (!isNotification) fail(id, -32601, `method not found: ${method}`); + } +} + +const rl = readline.createInterface({ input: process.stdin }); +rl.on('line', (line) => { + const trimmed = line.trim(); + if (!trimmed) return; + let msg; + try { msg = JSON.parse(trimmed); } catch { return; } + try { handle(msg); } + catch (err) { if (msg && msg.id != null) fail(msg.id, -32603, err.message); } +}); + +module.exports = { TOOLS, resolveRoot }; diff --git a/experiments/coordination-layer/mcp/tick.mcp.example.json b/experiments/coordination-layer/mcp/tick.mcp.example.json new file mode 100644 index 0000000..4923c27 --- /dev/null +++ b/experiments/coordination-layer/mcp/tick.mcp.example.json @@ -0,0 +1,11 @@ +{ + "mcpServers": { + "tick": { + "command": "node", + "args": ["experiments/coordination-layer/mcp/tick-mcp.js"], + "env": { + "TICK_REPO_ROOT": 
"/abs/path/to/the/coordination/workspace" + } + } + } +} From 3a087eb3b5aa7c7d53d70136aa22c6649ed1a892 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 8 Jun 2026 06:11:07 +0000 Subject: [PATCH 3/3] Trinity: trial mcp-doctor + wire tick MCP into repo .mcp.json - trial mcp-doctor: preflight health check for the tick MCP server (handshake + tools/list + non-mutating tick_analyze round-trip); throwaway state by default, --repo-root . to check real state read-only. - .mcp.json: register the `tick` server at repo root so Claude Code can coordinate via tick_* tools directly. - mcp/client.js: extract the shared MCP stdio JSON-RPC client (dedupes the smoke-test client). Regression: mcp-smoke 8/8, battery 11/11, confirm-cli-orchestration 7/7. https://claude.ai/code/session_01WnzAdCRGrrhukvW1etFLyB --- .mcp.json | 4 ++ CHANGELOG.md | 9 ++++ .../coordination-layer/harness/README.md | 1 + .../coordination-layer/harness/bin/trial | 53 +++++++++++++++++++ experiments/coordination-layer/mcp/README.md | 16 +++++- experiments/coordination-layer/mcp/client.js | 41 ++++++++++++++ .../coordination-layer/mcp/test/mcp-smoke.js | 28 +--------- 7 files changed, 124 insertions(+), 28 deletions(-) create mode 100644 experiments/coordination-layer/mcp/client.js diff --git a/.mcp.json b/.mcp.json index ff54276..7ed0603 100644 --- a/.mcp.json +++ b/.mcp.json @@ -3,6 +3,10 @@ "ai-ddtk": { "command": "bash", "args": ["tools/mcp-server/start.sh"] + }, + "tick": { + "command": "node", + "args": ["experiments/coordination-layer/mcp/tick-mcp.js"] } } } diff --git a/CHANGELOG.md b/CHANGELOG.md index cdeefc7..de0fe8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,15 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Do not edit a version block that has already been committed and pushed --> +## [2.3.1] - 2026-06-08 + +### Added +- **`trial mcp-doctor`** — preflight health check for the tick MCP server: spawns it, completes the MCP handshake, lists its 
tools, and round-trips a non-mutating `tick_analyze` call. Defaults to a throwaway `.tick/` state; `--repo-root .` checks the repo's real state read-only. +- **tick MCP wired into `.mcp.json`** — the `tick` server is now registered at the repo root (resolving to the repo's `.tick/`), so MCP clients (Claude Code) can coordinate via the `tick_*` tools directly without extra setup. + +### Changed +- **`mcp/client.js`** — extracted the tiny MCP stdio JSON-RPC client shared by `mcp-doctor` and `mcp-smoke.js` (removes the duplicated client in the smoke test). + ## [2.3.0] - 2026-06-08 ### Added diff --git a/experiments/coordination-layer/harness/README.md b/experiments/coordination-layer/harness/README.md index 5746ae8..b2b75c8 100644 --- a/experiments/coordination-layer/harness/README.md +++ b/experiments/coordination-layer/harness/README.md @@ -25,6 +25,7 @@ cd experiments/coordination-layer/harness node bin/trial list # the battery of trial specs node bin/trial doctor # which agent CLIs are installed +node bin/trial mcp-doctor # health-check the tick MCP server node bin/trial validate debug-calc-bugs # parse + validate a spec # Real run (needs gemini + codex installed with API keys): diff --git a/experiments/coordination-layer/harness/bin/trial b/experiments/coordination-layer/harness/bin/trial index 38b67fc..c8475a2 100755 --- a/experiments/coordination-layer/harness/bin/trial +++ b/experiments/coordination-layer/harness/bin/trial @@ -8,6 +8,7 @@ // // trial list list the battery of trial specs // trial doctor show which agent CLIs are installed +// trial mcp-doctor [--repo-root D] ping the tick MCP server (handshake + tools/list + a tool call) // trial validate parse + validate a spec (ingestion stage 1) // trial preflight [opts] let agents ask clarifying questions, then gate // trial run [opts] preflight → human gate → run → analyze → report @@ -21,6 +22,7 @@ // --max-attempts fix-iterate attempts before an agent circuit-breaks (default 3) const fs = require('fs'); 
/**
 * Preflight health check for the tick MCP server.
 *
 * Spawns the server, completes the MCP handshake (`initialize` followed by the
 * `notifications/initialized` notification), lists its tools, pings it, and
 * round-trips one `tick_analyze` tool call. Each step prints a ✅/❌ line.
 *
 * @param {object} flags - parsed CLI flags. A string `flags['repo-root']` is
 *   resolved and passed to the server as TICK_REPO_ROOT; otherwise a temp dir
 *   is created for the check and removed afterwards.
 * @returns {Promise<number>} process exit code: 0 when every step passed, 1 otherwise.
 */
async function cmdMcpDoctor(flags) {
  const { makeClient } = require(path.join(MCP_DIR, 'client.js'));
  const server = path.join(MCP_DIR, 'tick-mcp.js');
  const ephemeral = typeof flags['repo-root'] !== 'string';
  const root = ephemeral
    ? fs.mkdtempSync(path.join(os.tmpdir(), 'tick-mcp-doctor-'))
    : path.resolve(flags['repo-root']);

  console.log(`tick MCP doctor — server: ${path.relative(process.cwd(), server)}`);
  console.log(` TICK_REPO_ROOT: ${root}${ephemeral ? ' (throwaway)' : ''}\n`);

  let healthy = true;
  const check = (ok, label, detail) => {
    console.log(` ${ok ? '✅' : '❌'} ${label}${detail ? ` — ${detail}` : ''}`);
    if (!ok) healthy = false;
  };
  // Message of a JSON-RPC error response, or null when the response carries none.
  const rpcError = (res) => (res && res.error && res.error.message) || null;

  let client = null;
  // Name of the step in flight, so a thrown failure (e.g. a timeout) is
  // reported against the step that actually failed.
  let step = 'spawn server';
  try {
    client = makeClient({
      command: process.execPath,
      args: [server],
      env: { ...process.env, TICK_REPO_ROOT: root },
    });

    step = 'initialize';
    const init = await client.call('initialize', { protocolVersion: '2024-11-05', capabilities: {} });
    const info = init.result && init.result.serverInfo;
    check(
      Boolean(info),
      step,
      info
        ? `${info.name} v${info.version} (protocol ${init.result.protocolVersion})`
        : rpcError(init) || 'no serverInfo',
    );
    client.notify('notifications/initialized');

    step = 'tools/list';
    const list = await client.call('tools/list', {});
    const tools = (list.result && list.result.tools) || [];
    check(tools.length > 0, step, rpcError(list) || `${tools.length} tools`);
    if (tools.length) console.log(' ' + tools.map((t) => t.name).join(', '));

    step = 'ping';
    const ping = await client.call('ping', {});
    // A healthy ping is answered with an empty result object.
    check(Boolean(ping.result) && Object.keys(ping.result).length === 0, step, rpcError(ping));

    // Round-trip a harmless tool call (empty-state analyze) to prove dispatch.
    step = 'tools/call tick_analyze';
    const call = await client.call('tools/call', { name: 'tick_analyze', arguments: { format: 'json' } });
    const first = call.result && call.result.content && call.result.content[0];
    let parsed = null;
    try {
      parsed = JSON.parse(first && first.text);
    } catch {
      // Not JSON (or no content at all): reported below as "no parseable report".
    }
    let detail;
    if (!parsed) {
      detail = rpcError(call) || 'no parseable report';
    } else if (parsed.window && parsed.window.total_events !== undefined) {
      detail = `${parsed.window.total_events} events in target state`;
    } else {
      // Never dereference a missing `window`: a TypeError here would be
      // misreported as a transport failure by the catch below.
      detail = 'report has no window.total_events';
    }
    check(Boolean(parsed) && !call.result.isError, step, detail);
  } catch (err) {
    check(false, step, err.message);
  } finally {
    if (client) client.close();
    if (ephemeral) fs.rmSync(root, { recursive: true, force: true });
  }

  console.log(`\n${healthy ? '✅ tick MCP server healthy.' : '❌ tick MCP server has problems (see above).'}`);
  return healthy ? 0 : 1;
}
/**
 * Tiny MCP stdio client: spawn an MCP server and drive it over
 * newline-delimited JSON-RPC 2.0 on the child's stdin/stdout (stderr is
 * inherited so server diagnostics stay visible).
 *
 * @param {object} opts
 * @param {string} opts.command - executable to spawn.
 * @param {string[]} opts.args - arguments for the executable.
 * @param {object} [opts.env] - child environment; defaults to `process.env`.
 * @returns {{call: Function, notify: Function, close: Function}}
 *   - `call(method, params, timeoutMs = 10000)` resolves with the raw JSON-RPC
 *     response message. If the server cannot be spawned, cannot be written to,
 *     or has exited, it resolves with `{ error: { message } }`. It rejects only
 *     when no response arrives within `timeoutMs`.
 *   - `notify(method, params)` sends a notification (no id, no response).
 *   - `close()` stops reading and kills the server.
 */
function makeClient({ command, args, env }) {
  const child = spawn(command, args, { env: env || process.env, stdio: ['pipe', 'pipe', 'inherit'] });
  const rl = readline.createInterface({ input: child.stdout });
  const pending = new Map(); // request id -> settle(responseMessage)
  let dead = null; // why the server is unusable, once it is

  rl.on('line', (line) => {
    const text = line.trim();
    if (!text) return;
    let msg;
    try {
      msg = JSON.parse(text);
    } catch {
      return; // stray non-JSON output on stdout is not part of the protocol
    }
    if (msg.id != null && pending.has(msg.id)) {
      const settle = pending.get(msg.id);
      pending.delete(msg.id);
      settle(msg);
    }
  });

  // Settle every in-flight call with an error response instead of letting
  // callers sit out the full timeout once the server is known to be gone.
  function failAll(message) {
    if (dead === null) dead = message;
    const waiting = [...pending.values()];
    pending.clear();
    for (const settle of waiting) settle({ error: { message: dead } });
  }

  child.on('error', (err) => failAll(err.message));
  // 'close' (not 'exit') fires after stdout has been fully drained, so any
  // response the server wrote just before exiting is still delivered first.
  child.on('close', (code, signal) => {
    failAll(`server exited (${signal ? `signal ${signal}` : `code ${code}`})`);
  });
  // Without a listener, EPIPE from writing to a dead server would surface as
  // an unhandled stream 'error' and crash the host process.
  child.stdin.on('error', (err) => failAll(`cannot write to server: ${err.message}`));

  let nextId = 1;
  function call(method, params, timeoutMs = 10000) {
    if (dead !== null) return Promise.resolve({ error: { message: dead } });
    const id = nextId++;
    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        pending.delete(id);
        reject(new Error(`timeout waiting for ${method}`));
      }, timeoutMs);
      pending.set(id, (msg) => {
        clearTimeout(timer);
        resolve(msg);
      });
      child.stdin.write(JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n');
    });
  }

  function notify(method, params) {
    if (dead !== null) return;
    child.stdin.write(JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n');
  }

  function close() {
    rl.close();
    child.kill();
  }

  return { call, notify, close };
}
b/experiments/coordination-layer/mcp/test/mcp-smoke.js @@ -6,38 +6,14 @@ // root, asserting the coordination protocol works through the MCP surface. // Zero dependencies, runnable anywhere. -const { spawn } = require('child_process'); const fs = require('fs'); const os = require('os'); const path = require('path'); -const readline = require('readline'); const assert = require('assert'); +const { makeClient } = require('../client'); const SERVER = path.join(__dirname, '..', 'tick-mcp.js'); -function makeClient(env) { - const child = spawn(process.execPath, [SERVER], { env, stdio: ['pipe', 'pipe', 'inherit'] }); - const rl = readline.createInterface({ input: child.stdout }); - const pending = new Map(); - rl.on('line', (line) => { - if (!line.trim()) return; - const msg = JSON.parse(line); - if (msg.id != null && pending.has(msg.id)) { pending.get(msg.id)(msg); pending.delete(msg.id); } - }); - let nextId = 1; - function call(method, params) { - const id = nextId++; - return new Promise((resolve) => { - pending.set(id, resolve); - child.stdin.write(JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'); - }); - } - function notify(method, params) { - child.stdin.write(JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n'); - } - return { call, notify, close: () => child.kill() }; -} - function textOf(res) { assert(res.result, `expected result, got ${JSON.stringify(res)}`); return res.result.content.map(c => c.text).join(''); @@ -46,7 +22,7 @@ function textOf(res) { async function main() { const root = fs.mkdtempSync(path.join(os.tmpdir(), 'tick-mcp-')); const env = { ...process.env, TICK_REPO_ROOT: root }; - const c = makeClient(env); + const c = makeClient({ command: process.execPath, args: [SERVER], env }); let pass = 0; const ok = (m) => { console.log(` ✓ ${m}`); pass++; }; // Handshake