diff --git a/.agents/skills/choosing-swarm-patterns/SKILL.md b/.agents/skills/choosing-swarm-patterns/SKILL.md new file mode 100644 index 0000000..86878c3 --- /dev/null +++ b/.agents/skills/choosing-swarm-patterns/SKILL.md @@ -0,0 +1,261 @@ +--- +name: choosing-swarm-patterns +description: Use when coordinating multiple AI agents and need to pick the right orchestration pattern - covers 10 patterns (fan-out, pipeline, hub-spoke, consensus, mesh, handoff, cascade, dag, debate, hierarchical) with decision framework and reflection protocol +--- + +### Overview + +10 orchestration patterns for multi-agent workflows. Pick the simplest pattern that solves the problem — add complexity only when the system proves it's insufficient. + +### Quick Decision Framework + +#### ``` + +``` +Is the task independent per agent? + YES → fan-out (parallel workers) + +Does each step need the previous step's output? + YES → Is it strictly linear? + YES → pipeline + NO → dag (parallel where possible) + +Does a coordinator need to stay alive and adapt? + YES → Is there one level of management? + YES → hub-spoke + NO → hierarchical (multi-level) + +Is the task about making a decision? + YES → Do agents need to argue opposing sides? + YES → debate (adversarial) + NO → consensus (cooperative voting) + +Does the right specialist emerge during processing? + YES → handoff (dynamic routing) + +Do all agents need to freely collaborate? + YES → mesh (peer-to-peer) + +Is cost the primary concern? 
+ YES → cascade (cheap model first, escalate if needed) +``` + + +### Pattern Reference + +| # | Pattern | Topology | Agents | Best For | +|---|---------|----------|--------|----------| +| 1 | **fan-out** | Star (SDK center) | N parallel | Independent subtasks (reviews, research, tests) | +| 2 | **pipeline** | Linear chain | Sequential | Ordered stages (design → implement → test) | +| 3 | **hub-spoke** | Star (live hub) | 1 lead + N workers | Dynamic coordination, lead reviews/adjusts | +| 4 | **consensus** | Broadcast + vote | N voters | Architecture decisions, approval gates | +| 5 | **mesh** | Fully connected | N peers | Brainstorming, collaborative debugging | +| 6 | **handoff** | Routing chain | 1 active at a time | Triage, specialist routing, support flows | +| 7 | **cascade** | Tiered escalation | Cheapest → most capable | Cost optimization, production workloads | +| 8 | **dag** | Dependency graph | Parallel + joins | Complex projects with mixed dependencies | +| 9 | **debate** | Adversarial rounds | 2+ debaters + judge | Rigorous evaluation, architecture trade-offs | +| 10 | **hierarchical** | Tree (multi-level) | Lead → coordinators → workers | Large teams, domain separation | + +### Pattern Details + +#### 1. fan-out — Parallel Workers + +```ts +fanOut([ + { task: "Review auth.ts", name: "AuthReviewer" }, + { task: "Review db.ts", name: "DbReviewer" }, +], { cli: "claude" }); +``` + +#### 2. pipeline — Sequential Stages + +```ts +pipeline([ + { task: "Design the API schema", name: "Designer" }, + { task: "Implement the endpoints", name: "Implementer" }, + { task: "Write integration tests", name: "Tester" }, +]); +``` + +#### 3. hub-spoke — Persistent Coordinator + +```ts +hubAndSpoke({ + hub: { task: "Coordinate building a REST API", name: "Lead" }, + workers: [ + { task: "Build database models", name: "DbWorker" }, + { task: "Build route handlers", name: "ApiWorker" }, + ], +}); +``` + +#### 4. 
consensus — Cooperative Voting + +```ts +consensus({ + proposal: "Should we migrate to Fastify?", + voters: [ + { task: "Evaluate performance", name: "PerfExpert" }, + { task: "Evaluate DX", name: "DxExpert" }, + ], + consensusType: "majority", +}); +``` + +#### 5. mesh — Peer Collaboration + +```ts +mesh({ + goal: "Debug the auth flow returning 500", + agents: [ + { task: "Check server logs", name: "LogAnalyst" }, + { task: "Review auth code", name: "CodeReviewer" }, + { task: "Write repro test", name: "Tester" }, + ], +}); +``` + +#### 6. handoff — Dynamic Routing + +```ts +handoff({ + entryPoint: { task: "Triage the request", name: "Triage" }, + routes: [ + { agent: { task: "Handle billing", name: "Billing" }, condition: "billing, payment" }, + { agent: { task: "Handle tech issues", name: "TechSupport" }, condition: "error, bug" }, + ], + maxHandoffs: 3, +}); +``` + +#### 7. cascade — Cost-Aware Escalation + +```ts +cascade({ + tiers: [ + { agent: { task: "Answer this", cli: "claude" }, confidenceThreshold: 0.7, costWeight: 1 }, + { agent: { task: "Answer this", cli: "claude" }, confidenceThreshold: 0.85, costWeight: 5 }, + { agent: { task: "Answer this", cli: "claude" }, costWeight: 20 }, + ], +}); +``` + +#### 8. dag — Directed Acyclic Graph + +```ts +dag({ + nodes: [ + { id: "scaffold", task: "Create project scaffold" }, + { id: "frontend", task: "Build React UI", dependsOn: ["scaffold"] }, + { id: "backend", task: "Build API", dependsOn: ["scaffold"] }, + { id: "integrate", task: "Wire together", dependsOn: ["frontend", "backend"] }, + ], + maxConcurrency: 3, +}); +``` + +#### 9. debate — Adversarial Refinement + +```ts +debate({ + topic: "Monorepo vs polyrepo for the new platform?", + debaters: [ + { task: "Argue for monorepo", position: "monorepo" }, + { task: "Argue for polyrepo", position: "polyrepo" }, + ], + judge: { task: "Judge and decide", name: "ArchJudge" }, + maxRounds: 3, +}); +``` + +#### 10. 
hierarchical — Multi-Level Delegation + +```ts +hierarchical({ + agents: [ + { id: "lead", task: "Coordinate full-stack app", role: "lead" }, + { id: "fe-coord", task: "Manage frontend", role: "coordinator", reportsTo: "lead" }, + { id: "be-coord", task: "Manage backend", role: "coordinator", reportsTo: "lead" }, + { id: "fe-dev", task: "Build components", role: "worker", reportsTo: "fe-coord" }, + { id: "be-dev", task: "Build API", role: "worker", reportsTo: "be-coord" }, + ], +}); +``` + + +### Reflection Protocol + +#### All patterns support reflection — periodic synthesis that enables course correction. Enabled via `reflectionThreshold` on WorkflowOptions. + +```ts +{ + reflectionThreshold: 10, // trigger after 10 agent messages + onReflect: async (ctx) => { + // Examine ctx.recentMessages, ctx.agentStatuses + // Return adjustments or null + }, +} +``` + + +### Common Mistakes + +| Mistake | Why It Fails | Fix | +|---------|-------------|-----| +| Using mesh for everything | O(n^2) communication, debugging nightmare | Use hub-spoke for most tasks | +| Pipeline for independent work | Sequential bottleneck | Use fan-out or dag | +| Hub-spoke for simple parallel tasks | Hub is unnecessary overhead | Use fan-out | +| Consensus for non-decisions | Voting on implementation tasks wastes time | Use hub-spoke, let lead decide | +| No circuit breaker on handoff | Infinite routing loops | Always set maxHandoffs | +| Cascade without confidence parsing | Agents don't report confidence | Convention injection handles this | +| Hierarchical for 3 agents | Management overhead exceeds benefit | Use hub-spoke for small teams | + +### DAG Executor — Proven Pattern + +#### Agent Completion: Detect → Release → Collect + +``` +Agent writes summary file → Orchestrator polls (5s) → Detects new mtime → + Reads summary → Calls client.release(agent) → agent_exited fires → Node marked complete +``` + +#### State & Resume + +```ts +saveState(completed, depsOutput, results, startTime); +// 
Restart with --resume to skip completed nodes +``` + + +### YAML Workflow Definition + +#### Any pattern can be defined in YAML for portability: + +```yaml +version: "1.0" +name: feature-dev +pattern: hub-spoke +agents: + - id: lead + role: lead + cli: claude + - id: developer + role: worker + cli: codex + reportsTo: lead +steps: + - id: plan + agent: lead + prompt: "Create a development plan for: {{task}}" + expects: "PLAN_COMPLETE" + - id: implement + agent: developer + dependsOn: [plan] + prompt: "Implement: {{steps.plan.output}}" + expects: "DONE" +reflection: + enabled: true + threshold: 10 +trajectory: + enabled: true +``` diff --git a/.agents/skills/relay-80-100-workflow/SKILL.md b/.agents/skills/relay-80-100-workflow/SKILL.md new file mode 100644 index 0000000..93aa90e --- /dev/null +++ b/.agents/skills/relay-80-100-workflow/SKILL.md @@ -0,0 +1,363 @@ +--- +name: relay-80-100-workflow +description: Use when writing agent-relay workflows that must fully validate features end-to-end before merging. Covers the 80-to-100 pattern - going beyond "code compiles" to "feature works, tested E2E locally." Includes PGlite for in-memory Postgres testing, mock sandbox patterns, test-fix-rerun loops, verify gates after every edit, and the full lifecycle from implementation through passing tests to commit. +--- + +### Overview + +Most agent workflows get features to ~80%: code written, types check, maybe a build passes. This skill covers the **80-to-100 gap** — making workflows that fully validate features end-to-end before committing. The goal: every feature merged via these workflows is **tested, verified, and known-working**, not just "it compiles." 
+ +### When to Use + +- Writing workflows where the deliverable must be **production-ready**, not just code-complete +- Features that touch databases, APIs, or infrastructure that can be tested locally +- Any workflow where "it compiles" is not sufficient proof of correctness +- When you want confidence that the commit actually works before deploying + +### Core Principle: Test In The Workflow + +#### The key insight: **run tests as deterministic steps inside the workflow itself**. Don't just write test files — execute them, verify they pass, fix failures, and re-run. The workflow doesn't commit until tests are green. + +``` +implement → write tests → run tests → fix failures → re-run → build check → regression check → commit +``` + + +### The Test-Fix-Rerun Pattern + +#### Every testable feature in a workflow should follow this three-step pattern: + +```typescript +// Step 1: Run tests (allow failure — we expect issues on first run) +.step('run-tests', { + type: 'deterministic', + dependsOn: ['create-tests'], + command: 'npx tsx --test tests/my-feature.test.ts 2>&1 | tail -60', + captureOutput: true, + failOnError: false, // <-- Don't fail the workflow, let the agent fix it +}) + +// Step 2: Agent reads output, fixes issues, re-runs until green +.step('fix-tests', { + agent: 'tester', + dependsOn: ['run-tests'], + task: `Check the test output and fix any failures. + +Test output: +{{steps.run-tests.output}} + +If all tests passed, do nothing. +If there are failures: +1. Read the failing test file and source files +2. Fix the issues (could be in test or source) +3. Re-run: npx tsx --test tests/my-feature.test.ts +4. 
Keep fixing until ALL tests pass.`, + verification: { type: 'exit_code' }, +}) + +// Step 3: Deterministic final run — this one MUST pass +.step('run-tests-final', { + type: 'deterministic', + dependsOn: ['fix-tests'], + command: 'npx tsx --test tests/my-feature.test.ts 2>&1', + captureOutput: true, + failOnError: true, // <-- Hard fail if tests still broken +}) +``` + + +### PGlite: In-Memory Postgres for Database Testing + +#### Setup + +```typescript +.step('install-pglite', { + type: 'deterministic', + command: 'npm install --save-dev @electric-sql/pglite 2>&1 | tail -5', + captureOutput: true, +}) +``` + +#### Test Helper Pattern + +```typescript +// tests/helpers/pglite-db.ts +import { PGlite } from '@electric-sql/pglite'; +import { drizzle } from 'drizzle-orm/pglite'; +import * as schema from '../../packages/web/lib/db/schema.js'; + +// Raw DDL matching your Drizzle schema — PGlite doesn't run Drizzle migrations +const MY_TABLE_DDL = ` +CREATE TABLE IF NOT EXISTS my_table ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); +`; + +export async function createTestDb() { + const pg = new PGlite(); + await pg.exec(MY_TABLE_DDL); + const db = drizzle(pg, { schema }); + return { db, pg, schema, cleanup: () => pg.close() }; +} +``` + +#### Test Structure + +```typescript +// tests/my-feature.test.ts +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { randomUUID } from 'node:crypto'; +import { createTestDb } from './helpers/pglite-db.js'; + +describe('my feature', () => { + it('does the thing correctly', async () => { + const { db, schema, cleanup } = await createTestDb(); + try { + // Arrange + const testId = randomUUID(); + // Act — use your module against the real (in-memory) Postgres + // Assert + assert.equal(result.name, 'expected'); + } finally { + await cleanup(); + } + }); +}); +``` + + +### Verify Gates After Every Edit + +#### Never trust 
that an agent edited a file correctly. Add a deterministic verify gate after every agent edit step: + +```typescript +// Agent edits a file +.step('edit-schema', { + agent: 'impl', + dependsOn: ['read-schema'], + task: `Edit packages/web/lib/db/schema.ts...`, + verification: { type: 'exit_code' }, +}) + +// Deterministic verification — did the edit actually land? +.step('verify-schema', { + type: 'deterministic', + dependsOn: ['edit-schema'], + command: `if git diff --quiet packages/web/lib/db/schema.ts; then echo "NOT MODIFIED"; exit 1; fi +grep "my_new_table" packages/web/lib/db/schema.ts >/dev/null && echo "OK" || (echo "MISSING"; exit 1)`, + failOnError: true, + captureOutput: true, +}) +``` + + +### Mock Sandbox Pattern + +#### When testing code that interacts with Daytona sandboxes, use inline mock objects matching the existing test conventions: + +```typescript +const daytona = { + create: async () => ({ + id: 'sandbox-id', + process: { + executeCommand: async (cmd, cwd, env) => ({ + result: 'output', + exitCode: 0, + }), + }, + fs: { + uploadFile: async () => undefined, + }, + getUserHomeDir: async () => '/home/daytona', + }), + remove: async () => undefined, +}; +``` + + +### Regression Testing + +#### After your new tests pass, always run the **existing test suite** to catch regressions: + +```typescript +.step('run-existing-tests', { + type: 'deterministic', + dependsOn: ['fix-build'], + command: 'npm run orchestrator:test 2>&1 | tail -40', + captureOutput: true, + failOnError: false, +}) + +.step('fix-regressions', { + agent: 'impl', + dependsOn: ['run-existing-tests'], + task: `Check the full test suite for regressions caused by our changes. + +Test output: +{{steps.run-existing-tests.output}} + +If all tests passed, do nothing. +If EXISTING tests broke, read the failing test, find what we broke, fix it. +Most likely cause: constructor signatures changed, new required fields added +without defaults, or import paths shifted. 
+ +Run: npm run orchestrator:test +Fix until all tests pass.`, +  verification: { type: 'exit_code' }, +}) +``` + + +### Full Workflow Template + +#### Here's the complete pattern for a feature that touches the database: + +```typescript +import { workflow } from '@agent-relay/sdk/workflows'; + +const result = await workflow('my-feature') +  .description('Add feature X with full E2E validation') +  .pattern('dag') +  .channel('wf-my-feature') +  .maxConcurrency(3) +  .timeout(3_600_000) + +  .agent('impl', { cli: 'claude', preset: 'worker', retries: 2 }) +  .agent('tester', { cli: 'claude', preset: 'worker', retries: 2 }) + +  // ── Phase 1: Read ──────────────────────────────────────────────── +  .step('read-target', { +    type: 'deterministic', +    command: 'cat path/to/file.ts', +    captureOutput: true, +  }) + +  // ── Phase 2: Implement ─────────────────────────────────────────── +  .step('edit-target', { +    agent: 'impl', +    dependsOn: ['read-target'], +    task: `Edit path/to/file.ts. Current contents: +{{steps.read-target.output}} + +Only edit this one file.`, +    verification: { type: 'exit_code' }, +  }) +  .step('verify-target', { +    type: 'deterministic', +    dependsOn: ['edit-target'], +    command: 'if git diff --quiet path/to/file.ts; then echo "NOT MODIFIED"; exit 1; fi; echo "OK"', +    failOnError: true, +    captureOutput: true, +  }) + +  // ── Phase 3: Test infrastructure ───────────────────────────────── +  .step('install-pglite', { +    type: 'deterministic', +    command: 'npm install --save-dev @electric-sql/pglite 2>&1 | tail -5', +    captureOutput: true, +  }) +  .step('create-test-helpers', { +    agent: 'tester', +    dependsOn: ['install-pglite'], +    task: 'Create tests/helpers/pglite-db.ts with ...', +    verification: { type: 'file_exists', value: 'tests/helpers/pglite-db.ts' }, +  }) +  .step('create-tests', { +    agent: 'tester', +    dependsOn: ['create-test-helpers', 'verify-target'], +    task: 'Create tests/my-feature.test.ts with ...', +    verification: { type: 'file_exists', value: 
'tests/my-feature.test.ts' }, +  }) + +  // ── Phase 4: Test-fix-rerun loop ───────────────────────────────── +  .step('run-tests', { +    type: 'deterministic', +    dependsOn: ['create-tests'], +    command: 'npx tsx --test tests/my-feature.test.ts 2>&1 | tail -60', +    captureOutput: true, +    failOnError: false, +  }) +  .step('fix-tests', { +    agent: 'tester', +    dependsOn: ['run-tests'], +    task: `Fix any test failures. Output:\n{{steps.run-tests.output}}`, +    verification: { type: 'exit_code' }, +  }) +  .step('run-tests-final', { +    type: 'deterministic', +    dependsOn: ['fix-tests'], +    command: 'npx tsx --test tests/my-feature.test.ts 2>&1', +    captureOutput: true, +    failOnError: true, +  }) + +  // ── Phase 5: Build + regression ────────────────────────────────── +  .step('build-check', { +    type: 'deterministic', +    dependsOn: ['run-tests-final'], +    command: 'npx tsc --noEmit 2>&1 | tail -20; echo "EXIT: $?"', +    captureOutput: true, +    failOnError: false, +  }) +  .step('fix-build', { +    agent: 'impl', +    dependsOn: ['build-check'], +    task: `Fix type errors if any. Output:\n{{steps.build-check.output}}`, +    verification: { type: 'exit_code' }, +  }) +  .step('run-existing-tests', { +    type: 'deterministic', +    dependsOn: ['fix-build'], +    command: 'npm test 2>&1 | tail -40', +    captureOutput: true, +    failOnError: false, +  }) +  .step('fix-regressions', { +    agent: 'impl', +    dependsOn: ['run-existing-tests'], +    task: `Fix regressions if any. Output:\n{{steps.run-existing-tests.output}}`, +    verification: { type: 'exit_code' }, +  }) + +  // ── Phase 6: Commit ────────────────────────────────────────────── +  .step('commit', { +    type: 'deterministic', +    dependsOn: ['fix-regressions'], +    command: 'git add -A && git commit -m "feat: ..."', +    captureOutput: true, +    failOnError: true, +  }) + +  .onError('retry', { maxRetries: 2, retryDelayMs: 10_000 }) +  .run({ cwd: process.cwd() }); +``` + + +### Checklist: Is Your Workflow 80-to-100? 
+ +| Check | How | +|-------|-----| +| Tests exist | `file_exists` verification on test file | +| Tests actually run | Deterministic step executes them | +| Test failures get fixed | Agent step reads output, fixes, re-runs | +| Final test run is hard-gated | `failOnError: true` on last test step | +| Build passes | `npx tsc --noEmit` deterministic step | +| No regressions | Existing test suite runs after changes | +| Every edit is verified | `git diff --quiet` + grep after each agent edit | +| Commit only happens after all gates | `dependsOn` chains to final verification | + +### Common Anti-Patterns + +| Anti-pattern | Why it fails | Fix | +|-------------|-------------|-----| +| Tests written but never executed | Agent claims they pass, they don't | Add deterministic `run-tests` step | +| Single `failOnError: true` test run | First failure kills workflow, no chance to fix | Use the three-step test-fix-rerun pattern | +| No regression test | New feature works, old features break | Run `npm test` after build check | +| Agent asked to "write and run tests" in one step | Agent writes tests, runs them, they fail, it edits, output is garbled | Separate write/run/fix into distinct steps | +| PGlite DDL doesn't match Drizzle schema | Tests pass on wrong schema | Derive DDL from schema.ts or test with real migration | +| `failOnError: false` on final test run | Broken tests get committed | Always `failOnError: true` on the gate step | +| Testing only happy path | Edge cases break in prod | Specify edge case tests in the task prompt | +| No verify gate after agent edits | Agent exits 0 without writing anything | Add `git diff --quiet` check after every edit | diff --git a/.agents/skills/writing-agent-relay-workflows/SKILL.md b/.agents/skills/writing-agent-relay-workflows/SKILL.md new file mode 100644 index 0000000..333727b --- /dev/null +++ b/.agents/skills/writing-agent-relay-workflows/SKILL.md @@ -0,0 +1,574 @@ +--- +name: writing-agent-relay-workflows +description: Use 
when building multi-agent workflows with the relay broker-sdk - covers the WorkflowBuilder API, DAG step dependencies, agent definitions, step output chaining via {{steps.X.output}}, verification gates, evidence-based completion, owner decisions, dedicated channels, dynamic channel management (subscribe/unsubscribe/mute/unmute), swarm patterns, error handling, event listeners, step sizing rules, authoring best practices, and the lead+workers team pattern for complex steps +--- + +### Overview + +The relay broker-sdk workflow system orchestrates multiple AI agents (Claude, Codex, Gemini, Aider, Goose) through typed DAG-based workflows. Workflows can be written in **TypeScript** (preferred), **Python**, or **YAML**. + +**Language preference:** TypeScript > Python > YAML. Use TypeScript unless the project is Python-only or a simple config-driven workflow suits YAML. + +**Pattern selection:** Do not default to `dag` blindly. If the job needs a different swarm/workflow type, consult the `choosing-swarm-patterns` skill when available and select the pattern that best matches the coordination problem. + +### When to Use + +- Building multi-agent workflows with step dependencies +- Orchestrating different AI CLIs (claude, codex, gemini, aider, goose) +- Creating DAG, pipeline, fan-out, or other swarm patterns +- Needing verification gates, retries, or step output chaining +- Dynamic channel management: agents joining/leaving/muting channels mid-workflow + +### Quick Reference + +#### ```typescript + +```typescript +import { workflow } from '@agent-relay/sdk/workflows'; + +const result = await workflow('my-workflow') + .description('What this workflow does') + .pattern('dag') // or 'pipeline', 'fan-out', etc. 
+ .channel('wf-my-workflow') // dedicated channel (auto-generated if omitted) + .maxConcurrency(3) + .timeout(3_600_000) // global timeout (ms) + + .agent('lead', { cli: 'claude', role: 'Architect', retries: 2 }) + .agent('worker', { cli: 'codex', role: 'Implementer', retries: 2 }) + + .step('plan', { + agent: 'lead', + task: `Analyze the codebase and produce a plan.`, + retries: 2, + verification: { type: 'output_contains', value: 'PLAN_COMPLETE' }, + }) + .step('implement', { + agent: 'worker', + task: `Implement based on this plan:\n{{steps.plan.output}}`, + dependsOn: ['plan'], + verification: { type: 'exit_code' }, + }) + + .onError('retry', { maxRetries: 2, retryDelayMs: 10_000 }) + .run({ cwd: process.cwd() }); + + console.log('Result:', result.status); +``` + + +### ⚡ Parallelism — Design for Speed + +#### Cross-Workflow Parallelism: Wave Planning + +```bash +# BAD — sequential (14 hours for 27 workflows at ~30 min each) +agent-relay run workflows/34-sst-wiring.ts +agent-relay run workflows/35-env-config.ts +agent-relay run workflows/36-loading-states.ts +# ... 
one at a time + +# GOOD — parallel waves (3-4 hours for 27 workflows) +# Wave 1: independent infra (parallel) +agent-relay run workflows/34-sst-wiring.ts & +agent-relay run workflows/35-env-config.ts & +agent-relay run workflows/36-loading-states.ts & +agent-relay run workflows/37-responsive.ts & +wait +git add -A && git commit -m "Wave 1" + +# Wave 2: testing (parallel — independent test suites) +agent-relay run workflows/40-unit-tests.ts & +agent-relay run workflows/41-integration-tests.ts & +agent-relay run workflows/42-e2e-tests.ts & +wait +git add -A && git commit -m "Wave 2" +``` + +#### Declare File Scope for Planning + +```typescript +workflow('48-comparison-mode') + .packages(['web', 'core']) // monorepo packages touched + .isolatedFrom(['49-feedback-system']) // explicitly safe to parallelize + .requiresBefore(['46-admin-dashboard']) // explicit ordering constraint +``` + +#### Within-Workflow Parallelism + +```typescript +// BAD — unnecessary sequential chain +.step('fix-component-a', { agent: 'worker', dependsOn: ['review'] }) +.step('fix-component-b', { agent: 'worker', dependsOn: ['fix-component-a'] }) // why wait? + +// GOOD — parallel fan-out, merge at the end +.step('fix-component-a', { agent: 'impl-1', dependsOn: ['review'] }) +.step('fix-component-b', { agent: 'impl-2', dependsOn: ['review'] }) // same dep = parallel +.step('verify-all', { agent: 'reviewer', dependsOn: ['fix-component-a', 'fix-component-b'] }) +``` + + +### Failure Prevention + +#### 1. Do not use raw top-level `await` + +```ts +async function runWorkflow() { + const result = await workflow('my-workflow') + // ... + .run({ cwd: process.cwd() }); + + console.log('Workflow status:', result.status); +} + +runWorkflow().catch((error) => { + console.error(error); + process.exit(1); +}); +``` + +#### 3. Keep final verification boring and deterministic + +```bash +grep -Eq "foo|bar|baz" file.ts +``` + +#### 6. 
Be explicit about shell requirements + +```bash +/opt/homebrew/bin/bash workflows/your-workflow/execute.sh --wave 2 +``` + + +### End-to-End Bug Fix Workflows + +- **Capture the original failure** +- Reproduce the bug first in a deterministic or evidence-capturing step +- Save exact commands, logs, status codes, or screenshots/artifacts +- **State the acceptance contract** +- Define the exact end-to-end success criteria before implementation +- Include the real entrypoint a user would run +- **Implement the fix** +- **Rebuild / reinstall from scratch** +- Do not trust dirty local state +- Prefer a clean environment when install/bootstrap behavior is involved +- **Run targeted regression checks** +- Unit/integration tests are helpful but not sufficient by themselves +- **Run a full end-to-end validation** +- Use the real CLI / API / install path +- Prefer a clean environment (Docker, sandbox, cloud workspace, Daytona, etc.) for install/runtime issues +- **Compare before vs after evidence** +- Show that the original failure no longer occurs +- **Record residual risks** +- Call out what was not covered +- disposable sandbox / cloud workspace +- Docker / containerized environment +- fresh local shell with isolated paths +- compares candidate validation environments +- defines the acceptance contract +- chooses the best swarm pattern +- then authors the final fix/validation workflow + +### Key Concepts + +#### Verification Gates + +```typescript +verification: { type: 'exit_code' } // preferred for code-editing steps +verification: { type: 'output_contains', value: 'DONE' } // optional accelerator +verification: { type: 'file_exists', value: 'src/out.ts' } // deterministic file check +``` + +#### DAG Dependencies + +```typescript +.step('fix-types', { agent: 'worker', dependsOn: ['review'], ... }) +.step('fix-tests', { agent: 'worker', dependsOn: ['review'], ... }) +.step('final', { agent: 'lead', dependsOn: ['fix-types', 'fix-tests'], ... 
}) +``` + +#### SDK API + +```typescript +// Subscribe an agent to additional channels post-spawn +relay.subscribe({ agent: 'security-auditor', channels: ['review-pr-456'] }); + +// Unsubscribe — agent leaves the channel entirely +relay.unsubscribe({ agent: 'security-auditor', channels: ['general'] }); + +// Mute — agent stays subscribed (history access) but messages are NOT injected into PTY +relay.mute({ agent: 'security-auditor', channel: 'review-pr-123' }); + +// Unmute — resume PTY injection +relay.unmute({ agent: 'security-auditor', channel: 'review-pr-123' }); +``` + +#### Events + +```typescript +relay.onChannelSubscribed = (agent, channels) => { /* ... */ }; +relay.onChannelUnsubscribed = (agent, channels) => { /* ... */ }; +relay.onChannelMuted = (agent, channel) => { /* ... */ }; +relay.onChannelUnmuted = (agent, channel) => { /* ... */ }; +``` + + +### Agent Definition + +#### ```typescript + +```typescript +.agent('name', { + cli: 'claude' | 'codex' | 'gemini' | 'aider' | 'goose' | 'opencode' | 'droid', + role?: string, + preset?: 'lead' | 'worker' | 'reviewer' | 'analyst', + retries?: number, + model?: string, + interactive?: boolean, // default: true +}) +``` + +#### Model Constants + +```typescript +import { ClaudeModels, CodexModels, GeminiModels } from '@agent-relay/config'; + +.agent('planner', { cli: 'claude', model: ClaudeModels.OPUS }) // not 'opus' +.agent('worker', { cli: 'claude', model: ClaudeModels.SONNET }) // not 'sonnet' +.agent('coder', { cli: 'codex', model: CodexModels.GPT_5_4 }) // not 'gpt-5.4' +``` + + +### Step Definition + +#### Agent Steps + +```typescript +.step('name', { + agent: string, + task: string, // supports {{var}} and {{steps.NAME.output}} + dependsOn?: string[], + verification?: VerificationCheck, + retries?: number, +}) +``` + +#### Deterministic Steps (Shell Commands) + +```typescript +.step('verify-files', { + type: 'deterministic', + command: 'test -f src/auth.ts && echo "FILE_EXISTS"', + dependsOn: 
['implement'], + captureOutput: true, + failOnError: true, +}) +``` + + +### Common Patterns + +#### Interactive Team (lead + workers on shared channel) + +```typescript +.agent('lead', { + cli: 'claude', + model: ClaudeModels.OPUS, + role: 'Architect and reviewer — assigns work, reviews, posts feedback', + retries: 1, + // No preset — interactive by default +}) + +.agent('impl-new', { + cli: 'codex', + model: CodexModels.O3, + role: 'Creates new files. Listens on channel for assignments and feedback.', + retries: 2, + // No preset — interactive, receives channel messages +}) + +.agent('impl-modify', { + cli: 'codex', + model: CodexModels.O3, + role: 'Edits existing files. Listens on channel for assignments and feedback.', + retries: 2, +}) + +// All three share the same dependsOn — they start concurrently (no deadlock) +.step('lead-coordinate', { + agent: 'lead', + dependsOn: ['context'], + task: `You are the lead on #channel. Workers: impl-new, impl-modify. +Post the plan. Assign files. Review their work. Post feedback if needed. +Workers iterate based on your feedback. Exit when all files are correct.`, +}) +.step('impl-new-work', { + agent: 'impl-new', + dependsOn: ['context'], // same dep as lead = parallel start + task: `You are impl-new on #channel. Wait for the lead's plan. +Create files as assigned. Report completion. Fix issues from feedback.`, +}) +.step('impl-modify-work', { + agent: 'impl-modify', + dependsOn: ['context'], // same dep as lead = parallel start + task: `You are impl-modify on #channel. Wait for the lead's plan. +Edit files as assigned. Report completion. Fix issues from feedback.`, +}) +// Downstream gates on lead (lead exits when satisfied) +.step('verify', { type: 'deterministic', dependsOn: ['lead-coordinate'], ... }) +``` + +#### Pipeline (sequential handoff) + +```typescript +.pattern('pipeline') +.step('analyze', { agent: 'analyst', task: '...' 
}) +.step('implement', { agent: 'dev', task: '{{steps.analyze.output}}', dependsOn: ['analyze'] }) +.step('test', { agent: 'tester', task: '{{steps.implement.output}}', dependsOn: ['implement'] }) +``` + +#### Error Handling + +```typescript +.onError('fail-fast') // stop on first failure (default) +.onError('continue') // skip failed branches, continue others +.onError('retry', { maxRetries: 3, retryDelayMs: 5000 }) +``` + + +### Multi-File Edit Pattern + +#### When a workflow needs to modify multiple existing files, **use one agent step per file** with a deterministic verify gate after each. Agents reliably edit 1-2 files per step but fail on 4+. + +```yaml +steps: + - name: read-types + type: deterministic + command: cat src/types.ts + captureOutput: true + + - name: edit-types + agent: dev + dependsOn: [read-types] + task: | + Edit src/types.ts. Current contents: + {{steps.read-types.output}} + Add 'pending' to the Status union type. + Only edit this one file. + verification: + type: exit_code + + - name: verify-types + type: deterministic + dependsOn: [edit-types] + command: 'if git diff --quiet src/types.ts; then echo "NOT MODIFIED"; exit 1; fi; echo "OK"' + failOnError: true + + - name: read-service + type: deterministic + dependsOn: [verify-types] + command: cat src/service.ts + captureOutput: true + + - name: edit-service + agent: dev + dependsOn: [read-service] + task: | + Edit src/service.ts. Current contents: + {{steps.read-service.output}} + Add a handlePending() method. + Only edit this one file. 
+    verification:
+      type: exit_code
+
+  - name: verify-service
+    type: deterministic
+    dependsOn: [edit-service]
+    command: 'if git diff --quiet src/service.ts; then echo "NOT MODIFIED"; exit 1; fi; echo "OK"'
+    failOnError: true
+
+  # Deterministic commit — never rely on agents to commit
+  - name: commit
+    type: deterministic
+    dependsOn: [verify-service]
+    command: git add src/types.ts src/service.ts && git commit -m "feat: add pending status"
+    failOnError: true
+```
+
+
+### File Materialization: Verify Before Proceeding
+
+After any step that creates files, add a deterministic `file_exists` check before proceeding. Non-interactive agents may exit 0 without writing anything (wrong cwd, stdout instead of disk).
+
+```yaml
+- name: verify-files
+  type: deterministic
+  dependsOn: [impl-auth, impl-storage]
+  command: |
+    missing=0
+    for f in src/auth/credentials.ts src/storage/client.ts; do
+      if [ ! -f "$f" ]; then echo "MISSING: $f"; missing=$((missing+1)); fi
+    done
+    if [ $missing -gt 0 ]; then echo "$missing files missing"; exit 1; fi
+    echo "All files present"
+  failOnError: true
+```
+
+
+### DAG Deadlock Anti-Pattern
+
+```yaml
+# WRONG — deadlock: coordinate depends on context, work-a depends on coordinate
+steps:
+  - name: coordinate
+    dependsOn: [context]   # lead waits for WORKER_DONE...
+  - name: work-a
+    dependsOn: [coordinate] # ...but work-a can't start until coordinate finishes
+
+# RIGHT — workers and lead start in parallel
+steps:
+  - name: context
+    type: deterministic
+  - name: work-a
+    dependsOn: [context]   # starts with lead
+  - name: coordinate
+    dependsOn: [context]   # starts with workers
+  - name: merge
+    dependsOn: [work-a, coordinate]
+```
+
+
+### Step Sizing
+
+**One agent, one deliverable.** A step's task prompt should be 10-20 lines max.
+ +```yaml +# Team pattern: lead + workers on a shared channel +steps: + - name: track-lead-coord + agent: track-lead + dependsOn: [prior-step] + task: | + Lead the track on #my-track. Workers: track-worker-1, track-worker-2. + Post assignments to the channel. Review worker output. + + - name: track-worker-1-impl + agent: track-worker-1 + dependsOn: [prior-step] # same dep as lead — starts concurrently + task: | + Join #my-track. track-lead will post your assignment. + Implement the file as directed. + verification: + type: exit_code + + - name: next-step + dependsOn: [track-lead-coord] # downstream depends on lead, not workers +``` + + +### Supervisor Pattern + +When you set `.pattern('supervisor')` (or `hub-spoke`, `fan-out`), the runner auto-assigns a supervisor agent as owner for worker steps. The supervisor monitors progress, nudges idle workers, and issues `OWNER_DECISION`. + +**Auto-hardening only activates for hub patterns** — not `pipeline` or `dag`. + +| Use case | Pattern | Why | +|----------|---------|-----| +| Sequential, no monitoring | `pipeline` | Simple, no overhead | +| Workers need oversight | `supervisor` | Auto-owner monitors | +| Local/small models | `supervisor` | Supervisor catches stuck workers | +| All non-interactive | `pipeline` or `dag` | No PTY = no supervision needed | + +### Concurrency + +**Cap `maxConcurrency` at 4-6.** Spawning 10+ agents simultaneously causes broker timeouts. 
+ +| Parallel agents | `maxConcurrency` | +|-----------------|-------------------| +| 2-4 | 4 (default safe) | +| 5-10 | 5 | +| 10+ | 6-8 max | + +### Common Mistakes + +| Mistake | Fix | +|---------|-----| +| All workflows run sequentially | Group independent workflows into parallel waves (4-7x speedup) | +| Every step depends on the previous one | Only add `dependsOn` when there's a real data dependency | +| Self-review step with no timeout | Set `timeout: 300_000` (5 min) — Codex hangs in non-interactive review | +| One giant workflow per feature | Split into smaller workflows that can run in parallel waves | +| Adding exit instructions to tasks | Runner handles self-termination automatically | +| Setting `timeoutMs` on agents/steps | Use global `.timeout()` only | +| Using `general` channel | Set `.channel('wf-name')` for isolation | +| `{{steps.X.output}}` without `dependsOn: ['X']` | Output won't be available yet | +| Requiring exact sentinel as only completion gate | Use `exit_code` or `file_exists` verification | +| Writing 100-line task prompts | Split into lead + workers on a channel | +| `maxConcurrency: 16` with many parallel steps | Cap at 5-6 | +| Non-interactive agent reading large files via tools | Pre-read in deterministic step, inject via `{{steps.X.output}}` | +| Workers depending on lead step (deadlock) | Both depend on shared context step | +| `fan-out`/`hub-spoke` for simple parallel workers | Use `dag` instead | +| `pipeline` but expecting auto-supervisor | Only hub patterns auto-harden. 
Use `.pattern('supervisor')` | +| Workers without `preset: 'worker'` in one-shot DAG lead+worker flows | Add preset for clean stdout when chaining `{{steps.X.output}}` (not needed for interactive team patterns) | +| Using `_` in YAML numbers (`timeoutMs: 1_200_000`) | YAML doesn't support `_` separators | +| Workflow timeout under 30 min for complex workflows | Use `3600000` (1 hour) as default | +| Using `require()` in ESM projects | Check `package.json` for `"type": "module"` — use `import` if ESM | +| Wrapping in `async function main()` in ESM | ESM supports top-level `await` — no wrapper needed | +| Using `createWorkflowRenderer` | Does not exist. Use `.run({ cwd: process.cwd() })` | +| `export default workflow(...)...build()` | No `.build()`. Chain ends with `.run()` — the file must call `.run()`, not just export config | +| Relative import `'../workflows/builder.js'` | Use `import { workflow } from '@agent-relay/sdk/workflows'` | +| Hardcoded model strings (`model: 'opus'`) | Use constants: `import { ClaudeModels } from '@agent-relay/config'` → `model: ClaudeModels.OPUS` | +| Thinking `agent-relay run` inspects exports | It executes the file as a subprocess. Only `.run()` invocations trigger steps | +| `pattern('single')` on cloud runner | Not supported — use `dag` | +| `pattern('supervisor')` with one agent | Same agent is owner + specialist. Use `dag` | +| Invalid verification type (`type: 'deterministic'`) | Only `exit_code`, `output_contains`, `file_exists`, `custom` are valid | +| Chaining `{{steps.X.output}}` from interactive agents | PTY output is garbled. Use deterministic steps or `preset: 'worker'` | +| Single step editing 4+ files | Agents modify 1-2 then exit. Split to one file per step with verify gates | +| Relying on agents to `git commit` | Agents emit markers without running git. 
Use deterministic commit step | +| File-writing steps without `file_exists` verification | `exit_code` auto-passes even if no file written | +| Manual peer fanout in `handleChannelMessage()` | Use broker-managed channel subscriptions — broker fans out to all subscribers automatically | +| Client-side `personaNames.has(from)` filtering | Use `relay.subscribe()`/`relay.unsubscribe()` — only subscribed agents receive messages | +| Agents receiving noisy cross-channel messages during focused work | Use `relay.mute({ agent, channel })` to silence non-primary channels without leaving them | +| Hardcoding all channels at spawn time | Use `agent.subscribe()` / `agent.unsubscribe()` for dynamic channel membership post-spawn | +| Using `preset: 'worker'` for Codex in *interactive team* patterns when coordination is needed | Codex interactive mode works fine with PTY channel injection. Drop the preset for interactive team patterns (keep it for one-shot DAG workers where clean stdout matters) | +| Separate reviewer agent from lead in interactive team | Merge lead + reviewer into one interactive Claude agent — reviews between rounds, fewer agents | +| Not printing PR URL after `gh pr create` | Add a final deterministic step: `echo "PR: $(cat pr-url.txt)"` or capture in the `gh pr create` command | +| Workflow ending without worktree + PR for cross-repo changes | Add `setup-worktree` at start and `push-and-pr` + `cleanup-worktree` at end | + +### YAML Alternative + +#### ```yaml + +```yaml +version: '1.0' +name: my-workflow +swarm: + pattern: dag + channel: wf-my-workflow +agents: + - name: lead + cli: claude + role: Architect + - name: worker + cli: codex + role: Implementer +workflows: + - name: default + steps: + - name: plan + agent: lead + task: 'Produce a detailed implementation plan.' 
+ - name: implement + agent: worker + task: 'Implement: {{steps.plan.output}}' + dependsOn: [plan] + verification: + type: exit_code +``` + + +### Available Swarm Patterns + +`dag` (default), `fan-out`, `pipeline`, `hub-spoke`, `consensus`, `mesh`, `handoff`, `cascade`, `debate`, `hierarchical`, `map-reduce`, `scatter-gather`, `supervisor`, `reflection`, `red-team`, `verifier`, `auction`, `escalation`, `saga`, `circuit-breaker`, `blackboard`, `swarm` + +See skill `choosing-swarm-patterns` for pattern selection guidance. diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..5123f1d --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,7 @@ +{ + "permissions": { + "allow": [ + "mcp__relaycast__*" + ] + } +} diff --git a/.claude/skills/choosing-swarm-patterns/SKILL.md b/.claude/skills/choosing-swarm-patterns/SKILL.md new file mode 100644 index 0000000..f77862e --- /dev/null +++ b/.claude/skills/choosing-swarm-patterns/SKILL.md @@ -0,0 +1,338 @@ +--- +name: choosing-swarm-patterns +description: Use when coordinating multiple AI agents and need to pick the right orchestration pattern - covers 10 patterns (fan-out, pipeline, hub-spoke, consensus, mesh, handoff, cascade, dag, debate, hierarchical) with decision framework and reflection protocol +--- + +# Choosing Swarm Patterns + +## Overview + +10 orchestration patterns for multi-agent workflows. Pick the simplest pattern that solves the problem — add complexity only when the system proves it's insufficient. + +## Quick Decision Framework + +``` +Is the task independent per agent? + YES → fan-out (parallel workers) + +Does each step need the previous step's output? + YES → Is it strictly linear? + YES → pipeline + NO → dag (parallel where possible) + +Does a coordinator need to stay alive and adapt? + YES → Is there one level of management? + YES → hub-spoke + NO → hierarchical (multi-level) + +Is the task about making a decision? + YES → Do agents need to argue opposing sides? 
+ YES → debate (adversarial) + NO → consensus (cooperative voting) + +Does the right specialist emerge during processing? + YES → handoff (dynamic routing) + +Do all agents need to freely collaborate? + YES → mesh (peer-to-peer) + +Is cost the primary concern? + YES → cascade (cheap model first, escalate if needed) +``` + +## Pattern Reference + +| # | Pattern | Topology | Agents | Best For | +|---|---------|----------|--------|----------| +| 1 | **fan-out** | Star (SDK center) | N parallel | Independent subtasks (reviews, research, tests) | +| 2 | **pipeline** | Linear chain | Sequential | Ordered stages (design → implement → test) | +| 3 | **hub-spoke** | Star (live hub) | 1 lead + N workers | Dynamic coordination, lead reviews/adjusts | +| 4 | **consensus** | Broadcast + vote | N voters | Architecture decisions, approval gates | +| 5 | **mesh** | Fully connected | N peers | Brainstorming, collaborative debugging | +| 6 | **handoff** | Routing chain | 1 active at a time | Triage, specialist routing, support flows | +| 7 | **cascade** | Tiered escalation | Cheapest → most capable | Cost optimization, production workloads | +| 8 | **dag** | Dependency graph | Parallel + joins | Complex projects with mixed dependencies | +| 9 | **debate** | Adversarial rounds | 2+ debaters + judge | Rigorous evaluation, architecture trade-offs | +| 10 | **hierarchical** | Tree (multi-level) | Lead → coordinators → workers | Large teams, domain separation | + +## Pattern Details + +### 1. fan-out — Parallel Workers +```ts +fanOut([ + { task: "Review auth.ts", name: "AuthReviewer" }, + { task: "Review db.ts", name: "DbReviewer" }, +], { cli: "claude" }); +``` +- Workers run independently, no inter-agent communication +- SDK collects all DONE messages +- Use when: tasks are embarrassingly parallel + +### 2. 
pipeline — Sequential Stages +```ts +pipeline([ + { task: "Design the API schema", name: "Designer" }, + { task: "Implement the endpoints", name: "Implementer" }, + { task: "Write integration tests", name: "Tester" }, +]); +``` +- Stage N+1 receives Stage N's DONE summary as context +- Pipeline halts on failure +- Use when: clear linear dependency chain + +### 3. hub-spoke — Persistent Coordinator +```ts +hubAndSpoke({ + hub: { task: "Coordinate building a REST API", name: "Lead" }, + workers: [ + { task: "Build database models", name: "DbWorker" }, + { task: "Build route handlers", name: "ApiWorker" }, + ], +}); +``` +- Hub stays alive, receives ACK/DONE from workers +- Hub can spawn additional workers dynamically +- Use when: lead needs to review, adjust, and make decisions + +### 4. consensus — Cooperative Voting +```ts +consensus({ + proposal: "Should we migrate to Fastify?", + voters: [ + { task: "Evaluate performance", name: "PerfExpert" }, + { task: "Evaluate DX", name: "DxExpert" }, + ], + consensusType: "majority", +}); +``` +- Agents independently evaluate, then VOTE: approve/reject +- Supports majority, supermajority, unanimous, weighted, quorum +- Use when: need a decision with diverse perspectives + +### 5. mesh — Peer Collaboration +```ts +mesh({ + goal: "Debug the auth flow returning 500", + agents: [ + { task: "Check server logs", name: "LogAnalyst" }, + { task: "Review auth code", name: "CodeReviewer" }, + { task: "Write repro test", name: "Tester" }, + ], +}); +``` +- All agents on same channel, free communication +- Round tracking detects stalls +- Use when: collaborative exploration without hierarchy + +### 6. 
handoff — Dynamic Routing +```ts +handoff({ + entryPoint: { task: "Triage the request", name: "Triage" }, + routes: [ + { agent: { task: "Handle billing", name: "Billing" }, condition: "billing, payment" }, + { agent: { task: "Handle tech issues", name: "TechSupport" }, condition: "error, bug" }, + ], + maxHandoffs: 3, +}); +``` +- One active agent at a time; transfers control dynamically +- Circuit breaker prevents infinite routing loops +- Use when: right specialist isn't known upfront + +### 7. cascade — Cost-Aware Escalation +```ts +cascade({ + tiers: [ + { agent: { task: "Answer this", cli: "claude" }, confidenceThreshold: 0.7, costWeight: 1 }, + { agent: { task: "Answer this", cli: "claude" }, confidenceThreshold: 0.85, costWeight: 5 }, + { agent: { task: "Answer this", cli: "claude" }, costWeight: 20 }, + ], +}); +``` +- Start cheap, escalate if confidence < threshold +- Agent reports: `DONE [confidence=0.4]: ` +- Use when: most tasks are simple, some need heavy reasoning + +### 8. dag — Directed Acyclic Graph +```ts +dag({ + nodes: [ + { id: "scaffold", task: "Create project scaffold" }, + { id: "frontend", task: "Build React UI", dependsOn: ["scaffold"] }, + { id: "backend", task: "Build API", dependsOn: ["scaffold"] }, + { id: "integrate", task: "Wire together", dependsOn: ["frontend", "backend"] }, + ], + maxConcurrency: 3, +}); +``` +- Topological sort determines execution order +- Independent nodes run in parallel +- Use when: pipeline is too linear, fan-out is too flat + +### 9. 
debate — Adversarial Refinement +```ts +debate({ + topic: "Monorepo vs polyrepo for the new platform?", + debaters: [ + { task: "Argue for monorepo", position: "monorepo" }, + { task: "Argue for polyrepo", position: "polyrepo" }, + ], + judge: { task: "Judge and decide", name: "ArchJudge" }, + maxRounds: 3, +}); +``` +- Structured rounds: ARGUMENT → counterargument → VERDICT +- Optional judge; without judge, agents self-converge or split +- Use when: need rigorous adversarial examination + +### 10. hierarchical — Multi-Level Delegation +```ts +hierarchical({ + agents: [ + { id: "lead", task: "Coordinate full-stack app", role: "lead" }, + { id: "fe-coord", task: "Manage frontend", role: "coordinator", reportsTo: "lead" }, + { id: "be-coord", task: "Manage backend", role: "coordinator", reportsTo: "lead" }, + { id: "fe-dev", task: "Build components", role: "worker", reportsTo: "fe-coord" }, + { id: "be-dev", task: "Build API", role: "worker", reportsTo: "be-coord" }, + ], +}); +``` +- Workers → coordinators → lead (multi-level reporting) +- Coordinators synthesize sub-team output +- Use when: too many workers for one hub to manage + +## Reflection Protocol + +All patterns support reflection — periodic synthesis that enables course correction. Enabled via `reflectionThreshold` on WorkflowOptions. + +```ts +{ + reflectionThreshold: 10, // trigger after 10 agent messages + onReflect: async (ctx) => { + // Examine ctx.recentMessages, ctx.agentStatuses + // Return adjustments or null + }, +} +``` + +Reflection is event-driven (importance-weighted accumulation), not timer-based. See WORKFLOWS_SPEC.md for full details. 
+
+## Common Mistakes
+
+| Mistake | Why It Fails | Fix |
+|---------|-------------|-----|
+| Using mesh for everything | O(n^2) communication, debugging nightmare | Use hub-spoke for most tasks |
+| Pipeline for independent work | Sequential bottleneck | Use fan-out or dag |
+| Hub-spoke for simple parallel tasks | Hub is unnecessary overhead | Use fan-out |
+| Consensus for non-decisions | Voting on implementation tasks wastes time | Use hub-spoke, let lead decide |
+| No circuit breaker on handoff | Infinite routing loops | Always set maxHandoffs |
+| Cascade without confidence parsing | Agents don't report confidence | Convention injection handles this |
+| Hierarchical for 3 agents | Management overhead exceeds benefit | Use hub-spoke for small teams |
+
+## DAG Executor — Proven Pattern
+
+The recommended architecture for DAG workflow execution, validated on a 9-node / 5-wave production run.
+
+### Agent Completion: Detect → Release → Collect
+
+**This is the critical pattern.** Claude Code agents don't auto-exit — the orchestrator must detect completion and release them.
+
+```
+Agent writes summary file → Orchestrator polls (5s) → Detects new mtime →
+  Reads summary → Calls client.release(agent) → agent_exited fires → Node marked complete
+```
+
+**Implementation:**
+```ts
+// Track initial mtime to distinguish new writes from stale files
+let initialMtime = 0;
+try { initialMtime = statSync(summaryPath).mtimeMs; } catch {}
+
+// Poll for summary file every 5s (async callback — release is awaited below)
+const poll = setInterval(async () => {
+  let stat;
+  try { stat = statSync(summaryPath); } catch { return; } // summary not written yet
+  if (stat.mtimeMs > initialMtime) {
+    const content = readFileSync(summaryPath, "utf-8").trim();
+    await client.release(agentName); // triggers agent_exited
+    finish("completed", content);
+  }
+}, 5_000);
+```
+
+**Convention injection tells agents to:**
+1. Send summary via **Relaycast MCP** (`mcp__relaycast__send` to channel) for inter-agent communication
+2. 
Write summary to `.relay/summaries/{nodeId}.md` as the completion signal +3. Include file paths, type names, method signatures — downstream agents depend on this + +### Communication: Relaycast MCP + +Agents communicate through the Relaycast MCP, not file-based protocols: +- **Channel messages:** `mcp__relaycast__send` with channel name +- **Direct messages:** `mcp__relaycast__dm` with agent name +- Claude Code agents inherit `.mcp.json` config and have full MCP access +- Other CLIs (codex, aider) may not have MCP — use summary files as fallback + +### State & Resume + +Persist state after every node completion for crash recovery: +```ts +saveState(completed, depsOutput, results, startTime); +// Restart with --resume to skip completed nodes +``` + +**Pitfall:** When resuming, only load `completed` nodes — never load `failed` entries, or downstream will be permanently blocked. + +### Pitfalls Reference + +| Category | Pitfall | Fix | +|----------|---------|-----| +| **Completion** | Waiting for `agent_exited` without releasing — agents idle until timeout | Poll for summary file, release agent when detected | +| **Completion** | No resolved guard — poll interval and timeout both fire, double-resolve | `resolved` boolean flag checked before every resolve | +| **Signals** | PTY prompt echo matches signal keywords (`DONE:`, `ERROR:`) causing false completion | Never put signal keywords in task prompts; use file-based signals | +| **Summaries** | Thin summaries ("Created types") useless for downstream agents | Convention injection requires file paths, signatures, key exports | +| **Execution** | `Promise.race` in batch — one success masks later failures | `Promise.allSettled` for each batch | +| **Resilience** | No `--resume` — orchestrator crash loses all progress | Persist completed set + depsOutput after each node | +| **Resilience** | No downstream failure propagation — dependents stuck in limbo | Mark all transitive dependents as "blocked" on failure | +| 
**Convention** | Agents don't read existing code — output doesn't match project patterns | `readFirst` field per node, included in convention injection | +| **Capabilities** | Assuming all CLIs have MCP tools — codex/aider may not | Check CLI capabilities; use summary files as fallback for non-Claude CLIs | +| **Infrastructure** | Rust broker vs Node.js CLI binary confusion (same name, different behavior) | Always set explicit `binaryPath`; use unique broker names to avoid 409 conflicts | +| **Infrastructure** | `getLogs()` assumes Node.js daemon log files — Rust broker doesn't write them | Use broker events or summary files, not log file polling | + +## YAML Workflow Definition + +Any pattern can be defined in YAML for portability: + +```yaml +version: "1.0" +name: feature-dev +pattern: hub-spoke +agents: + - id: lead + role: lead + cli: claude + - id: developer + role: worker + cli: codex + reportsTo: lead +steps: + - id: plan + agent: lead + prompt: "Create a development plan for: {{task}}" + expects: "PLAN_COMPLETE" + - id: implement + agent: developer + dependsOn: [plan] + prompt: "Implement: {{steps.plan.output}}" + expects: "DONE" +reflection: + enabled: true + threshold: 10 +trajectory: + enabled: true +``` + +Store in `.relay/workflows/` and run with: +```ts +const workflow = await loadWorkflow(".relay/workflows/feature-dev.yaml"); +const run = runWorkflow(workflow, "Add user authentication"); +``` diff --git a/.claude/skills/relay-80-100-workflow/SKILL.md b/.claude/skills/relay-80-100-workflow/SKILL.md new file mode 100644 index 0000000..305a0c4 --- /dev/null +++ b/.claude/skills/relay-80-100-workflow/SKILL.md @@ -0,0 +1,406 @@ +--- +name: relay-80-100-workflow +description: Use when writing agent-relay workflows that must fully validate features end-to-end before merging. Covers the 80-to-100 pattern - going beyond "code compiles" to "feature works, tested E2E locally." 
Includes PGlite for in-memory Postgres testing, mock sandbox patterns, test-fix-rerun loops, verify gates after every edit, and the full lifecycle from implementation through passing tests to commit. +--- + +# Writing 80-to-100 Validated Workflows + +## Overview + +Most agent workflows get features to ~80%: code written, types check, maybe a build passes. This skill covers the **80-to-100 gap** — making workflows that fully validate features end-to-end before committing. The goal: every feature merged via these workflows is **tested, verified, and known-working**, not just "it compiles." + +## When to Use + +- Writing workflows where the deliverable must be **production-ready**, not just code-complete +- Features that touch databases, APIs, or infrastructure that can be tested locally +- Any workflow where "it compiles" is not sufficient proof of correctness +- When you want confidence that the commit actually works before deploying + +## Core Principle: Test In The Workflow + +The key insight: **run tests as deterministic steps inside the workflow itself**. Don't just write test files — execute them, verify they pass, fix failures, and re-run. The workflow doesn't commit until tests are green. + +``` +implement → write tests → run tests → fix failures → re-run → build check → regression check → commit +``` + +This means the commit at the end of the workflow represents code that is **proven working**, not just code that an agent wrote and claimed works. 
+ +## The Test-Fix-Rerun Pattern + +Every testable feature in a workflow should follow this three-step pattern: + +```typescript +// Step 1: Run tests (allow failure — we expect issues on first run) +.step('run-tests', { + type: 'deterministic', + dependsOn: ['create-tests'], + command: 'npx tsx --test tests/my-feature.test.ts 2>&1 | tail -60', + captureOutput: true, + failOnError: false, // <-- Don't fail the workflow, let the agent fix it +}) + +// Step 2: Agent reads output, fixes issues, re-runs until green +.step('fix-tests', { + agent: 'tester', + dependsOn: ['run-tests'], + task: `Check the test output and fix any failures. + +Test output: +{{steps.run-tests.output}} + +If all tests passed, do nothing. +If there are failures: +1. Read the failing test file and source files +2. Fix the issues (could be in test or source) +3. Re-run: npx tsx --test tests/my-feature.test.ts +4. Keep fixing until ALL tests pass.`, + verification: { type: 'exit_code' }, +}) + +// Step 3: Deterministic final run — this one MUST pass +.step('run-tests-final', { + type: 'deterministic', + dependsOn: ['fix-tests'], + command: 'npx tsx --test tests/my-feature.test.ts 2>&1', + captureOutput: true, + failOnError: true, // <-- Hard fail if tests still broken +}) +``` + +**Why three steps instead of one?** +- The first run captures output for the agent to diagnose +- The agent step can iterate (read errors, fix, re-run) multiple times +- The final deterministic run is the gate — no agent judgment, just pass/fail + +## PGlite: In-Memory Postgres for Database Testing + +When your feature touches the database, use **PGlite** — a WASM-based Postgres that runs in-process. No Docker, no external services, no flaky network dependencies. 
+ +### Setup + +Install as a dev dependency in the workflow: + +```typescript +.step('install-pglite', { + type: 'deterministic', + command: 'npm install --save-dev @electric-sql/pglite 2>&1 | tail -5', + captureOutput: true, +}) +``` + +### Test Helper Pattern + +Create a reusable helper that boots an in-memory Postgres with your schema: + +```typescript +// tests/helpers/pglite-db.ts +import { PGlite } from '@electric-sql/pglite'; +import { drizzle } from 'drizzle-orm/pglite'; +import * as schema from '../../packages/web/lib/db/schema.js'; + +// Raw DDL matching your Drizzle schema — PGlite doesn't run Drizzle migrations +const MY_TABLE_DDL = ` +CREATE TABLE IF NOT EXISTS my_table ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); +`; + +export async function createTestDb() { + const pg = new PGlite(); + await pg.exec(MY_TABLE_DDL); + const db = drizzle(pg, { schema }); + return { db, pg, schema, cleanup: () => pg.close() }; +} +``` + +### PGlite Gotchas + +| Issue | Fix | +|-------|-----| +| `pgcrypto` extension not available | Use `gen_random_uuid()` (built-in since PG 13) or generate UUIDs in app code | +| UUID columns | PGlite supports UUID natively — no special handling needed | +| `drizzle-orm/pglite` import | Exists since drizzle-orm 0.30+. If not found, check version. | +| Index creation | PGlite supports standard CREATE INDEX — no limitations | +| Concurrent writes | PGlite is single-connection. Test concurrent logic with sequential assertions. 
| + +### Test Structure + +```typescript +// tests/my-feature.test.ts +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { randomUUID } from 'node:crypto'; +import { createTestDb } from './helpers/pglite-db.js'; + +describe('my feature', () => { + it('does the thing correctly', async () => { + const { db, schema, cleanup } = await createTestDb(); + try { + // Arrange + const testId = randomUUID(); + // Act — use your module against the real (in-memory) Postgres + // Assert + assert.equal(result.name, 'expected'); + } finally { + await cleanup(); + } + }); +}); +``` + +## Verify Gates After Every Edit + +Never trust that an agent edited a file correctly. Add a deterministic verify gate after every agent edit step: + +```typescript +// Agent edits a file +.step('edit-schema', { + agent: 'impl', + dependsOn: ['read-schema'], + task: `Edit packages/web/lib/db/schema.ts...`, + verification: { type: 'exit_code' }, +}) + +// Deterministic verification — did the edit actually land? 
+.step('verify-schema', { + type: 'deterministic', + dependsOn: ['edit-schema'], + command: `if git diff --quiet packages/web/lib/db/schema.ts; then echo "NOT MODIFIED"; exit 1; fi +grep "my_new_table" packages/web/lib/db/schema.ts >/dev/null && echo "OK" || (echo "MISSING"; exit 1)`, + failOnError: true, + captureOutput: true, +}) +``` + +**What to verify:** +- File was actually modified (`git diff --quiet` returns non-zero) +- Key content exists (grep for table names, function names, imports) +- For new files: `file_exists` verification type + +**What NOT to verify:** +- Exact content (too brittle — agents format differently) +- Line counts or byte sizes (meaningless) + +## Mock Sandbox Pattern + +When testing code that interacts with Daytona sandboxes, use inline mock objects matching the existing test conventions: + +```typescript +const daytona = { + create: async () => ({ + id: 'sandbox-id', + process: { + executeCommand: async (cmd, cwd, env) => ({ + result: 'output', + exitCode: 0, + }), + }, + fs: { + uploadFile: async () => undefined, + }, + getUserHomeDir: async () => '/home/daytona', + }), + remove: async () => undefined, +}; +``` + +For testing that your code calls the right methods, record calls in an array: + +```typescript +const emitted: EmitEventOptions[] = []; +const mockClient: SessionEventClient = { + emit: async (opts) => { emitted.push(opts); }, + getEvents: async () => [], + getLatestSequence: async () => 0, +}; + +// ... run the code ... 
+ +assert.equal(emitted.length, 4); +assert.equal(emitted[0].eventType, 'sandbox_created'); +``` + +## Regression Testing + +After your new tests pass, always run the **existing test suite** to catch regressions: + +```typescript +.step('run-existing-tests', { + type: 'deterministic', + dependsOn: ['fix-build'], + command: 'npm run orchestrator:test 2>&1 | tail -40', + captureOutput: true, + failOnError: false, +}) + +.step('fix-regressions', { + agent: 'impl', + dependsOn: ['run-existing-tests'], + task: `Check the full test suite for regressions caused by our changes. + +Test output: +{{steps.run-existing-tests.output}} + +If all tests passed, do nothing. +If EXISTING tests broke, read the failing test, find what we broke, fix it. +Most likely cause: constructor signatures changed, new required fields added +without defaults, or import paths shifted. + +Run: npm run orchestrator:test +Fix until all tests pass.`, + verification: { type: 'exit_code' }, +}) +``` + +## Full Workflow Template + +Here's the complete pattern for a feature that touches the database: + +```typescript +import { workflow } from '@agent-relay/sdk/workflows'; + +const result = await workflow('my-feature') + .description('Add feature X with full E2E validation') + .pattern('dag') + .channel('wf-my-feature') + .maxConcurrency(3) + .timeout(3_600_000) + + .agent('impl', { cli: 'claude', preset: 'worker', retries: 2 }) + .agent('tester', { cli: 'claude', preset: 'worker', retries: 2 }) + + // ── Phase 1: Read ──────────────────────────────────────────────── + .step('read-target', { + type: 'deterministic', + command: 'cat path/to/file.ts', + captureOutput: true, + }) + + // ── Phase 2: Implement ─────────────────────────────────────────── + .step('edit-target', { + agent: 'impl', + dependsOn: ['read-target'], + task: `Edit path/to/file.ts. 
Current contents:
+{{steps.read-target.output}}
+
+Only edit this one file.`,
+ verification: { type: 'exit_code' },
+ })
+ .step('verify-target', {
+ type: 'deterministic',
+ dependsOn: ['edit-target'],
+ command: 'if git diff --quiet path/to/file.ts; then echo "NOT MODIFIED"; exit 1; fi; echo "OK"',
+ failOnError: true,
+ captureOutput: true,
+ })
+
+ // ── Phase 3: Test infrastructure ─────────────────────────────────
+ .step('install-pglite', {
+ type: 'deterministic',
+ command: 'npm install --save-dev @electric-sql/pglite 2>&1 | tail -5',
+ captureOutput: true,
+ })
+ .step('create-test-helpers', {
+ agent: 'tester',
+ dependsOn: ['install-pglite'],
+ task: 'Create tests/helpers/pglite-db.ts with ...',
+ verification: { type: 'file_exists', value: 'tests/helpers/pglite-db.ts' },
+ })
+ .step('create-tests', {
+ agent: 'tester',
+ dependsOn: ['create-test-helpers', 'verify-target'],
+ task: 'Create tests/my-feature.test.ts with ...',
+ verification: { type: 'file_exists', value: 'tests/my-feature.test.ts' },
+ })
+
+ // ── Phase 4: Test-fix-rerun loop ─────────────────────────────────
+ .step('run-tests', {
+ type: 'deterministic',
+ dependsOn: ['create-tests'],
+ command: 'npx tsx --test tests/my-feature.test.ts 2>&1 | tail -60',
+ captureOutput: true,
+ failOnError: false,
+ })
+ .step('fix-tests', {
+ agent: 'tester',
+ dependsOn: ['run-tests'],
+ task: `Fix any test failures. 
Output:\n{{steps.run-tests.output}}`,
+ verification: { type: 'exit_code' },
+ })
+ .step('run-tests-final', {
+ type: 'deterministic',
+ dependsOn: ['fix-tests'],
+ command: 'npx tsx --test tests/my-feature.test.ts 2>&1',
+ captureOutput: true,
+ failOnError: true,
+ })
+
+ // ── Phase 5: Build + regression ────────────────────────────────
+ .step('build-check', {
+ type: 'deterministic',
+ dependsOn: ['run-tests-final'],
+ command: 'out=$(npx tsc --noEmit 2>&1); code=$?; echo "$out" | tail -20; echo "EXIT: $code"',
+ captureOutput: true,
+ failOnError: false,
+ })
+ .step('fix-build', {
+ agent: 'impl',
+ dependsOn: ['build-check'],
+ task: `Fix type errors if any. Output:\n{{steps.build-check.output}}`,
+ verification: { type: 'exit_code' },
+ })
+ .step('run-existing-tests', {
+ type: 'deterministic',
+ dependsOn: ['fix-build'],
+ command: 'npm test 2>&1 | tail -40',
+ captureOutput: true,
+ failOnError: false,
+ })
+ .step('fix-regressions', {
+ agent: 'impl',
+ dependsOn: ['run-existing-tests'],
+ task: `Fix regressions if any. Output:\n{{steps.run-existing-tests.output}}`,
+ verification: { type: 'exit_code' },
+ })
+
+ // ── Phase 6: Commit ──────────────────────────────────────────────
+ .step('commit', {
+ type: 'deterministic',
+ dependsOn: ['fix-regressions'],
+ command: 'git add -A && git commit -m "feat: ..."',
+ captureOutput: true,
+ failOnError: true,
+ })
+
+ .onError('retry', { maxRetries: 2, retryDelayMs: 10_000 })
+ .run({ cwd: process.cwd() });
+```
+
+## Checklist: Is Your Workflow 80-to-100? 
+ +| Check | How | +|-------|-----| +| Tests exist | `file_exists` verification on test file | +| Tests actually run | Deterministic step executes them | +| Test failures get fixed | Agent step reads output, fixes, re-runs | +| Final test run is hard-gated | `failOnError: true` on last test step | +| Build passes | `npx tsc --noEmit` deterministic step | +| No regressions | Existing test suite runs after changes | +| Every edit is verified | `git diff --quiet` + grep after each agent edit | +| Commit only happens after all gates | `dependsOn` chains to final verification | + +## Common Anti-Patterns + +| Anti-pattern | Why it fails | Fix | +|-------------|-------------|-----| +| Tests written but never executed | Agent claims they pass, they don't | Add deterministic `run-tests` step | +| Single `failOnError: true` test run | First failure kills workflow, no chance to fix | Use the three-step test-fix-rerun pattern | +| No regression test | New feature works, old features break | Run `npm test` after build check | +| Agent asked to "write and run tests" in one step | Agent writes tests, runs them, they fail, it edits, output is garbled | Separate write/run/fix into distinct steps | +| PGlite DDL doesn't match Drizzle schema | Tests pass on wrong schema | Derive DDL from schema.ts or test with real migration | +| `failOnError: false` on final test run | Broken tests get committed | Always `failOnError: true` on the gate step | +| Testing only happy path | Edge cases break in prod | Specify edge case tests in the task prompt | +| No verify gate after agent edits | Agent exits 0 without writing anything | Add `git diff --quiet` check after every edit | diff --git a/.claude/skills/writing-agent-relay-workflows/SKILL.md b/.claude/skills/writing-agent-relay-workflows/SKILL.md new file mode 100644 index 0000000..254d3d8 --- /dev/null +++ b/.claude/skills/writing-agent-relay-workflows/SKILL.md @@ -0,0 +1,832 @@ +--- +name: writing-agent-relay-workflows +description: Use 
when building multi-agent workflows with the relay broker-sdk - covers the WorkflowBuilder API, DAG step dependencies, agent definitions, step output chaining via {{steps.X.output}}, verification gates, evidence-based completion, owner decisions, dedicated channels, dynamic channel management (subscribe/unsubscribe/mute/unmute), swarm patterns, error handling, event listeners, step sizing rules, authoring best practices, and the lead+workers team pattern for complex steps +--- + +# Writing Agent Relay Workflows + +## Overview + +The relay broker-sdk workflow system orchestrates multiple AI agents (Claude, Codex, Gemini, Aider, Goose) through typed DAG-based workflows. Workflows can be written in **TypeScript** (preferred), **Python**, or **YAML**. + +**Language preference:** TypeScript > Python > YAML. Use TypeScript unless the project is Python-only or a simple config-driven workflow suits YAML. + +**Pattern selection:** Do not default to `dag` blindly. If the job needs a different swarm/workflow type, consult the `choosing-swarm-patterns` skill when available and select the pattern that best matches the coordination problem. + +## When to Use + +- Building multi-agent workflows with step dependencies +- Orchestrating different AI CLIs (claude, codex, gemini, aider, goose) +- Creating DAG, pipeline, fan-out, or other swarm patterns +- Needing verification gates, retries, or step output chaining +- Dynamic channel management: agents joining/leaving/muting channels mid-workflow + +## Quick Reference + +```typescript +import { workflow } from '@agent-relay/sdk/workflows'; + +const result = await workflow('my-workflow') + .description('What this workflow does') + .pattern('dag') // or 'pipeline', 'fan-out', etc. 
+ .channel('wf-my-workflow') // dedicated channel (auto-generated if omitted) + .maxConcurrency(3) + .timeout(3_600_000) // global timeout (ms) + + .agent('lead', { cli: 'claude', role: 'Architect', retries: 2 }) + .agent('worker', { cli: 'codex', role: 'Implementer', retries: 2 }) + + .step('plan', { + agent: 'lead', + task: `Analyze the codebase and produce a plan.`, + retries: 2, + verification: { type: 'output_contains', value: 'PLAN_COMPLETE' }, + }) + .step('implement', { + agent: 'worker', + task: `Implement based on this plan:\n{{steps.plan.output}}`, + dependsOn: ['plan'], + verification: { type: 'exit_code' }, + }) + + .onError('retry', { maxRetries: 2, retryDelayMs: 10_000 }) + .run({ cwd: process.cwd() }); + + console.log('Result:', result.status); +``` + +**Critical TypeScript rules:** +1. Check the project's `package.json` for `"type": "module"` — if ESM, use `import` and top-level `await`. If CJS, use `require()` and wrap in `async function main()`. +2. `agent-relay run ` executes the file as a standalone subprocess — it does NOT inspect exports. The file MUST call `.run()`. +3. Use `.run({ cwd: process.cwd() })` — `createWorkflowRenderer` does not exist +4. Validate with `--dry-run` before running: `agent-relay run --dry-run workflow.ts` + +## ⚡ Parallelism — Design for Speed + +**This is the most important design consideration.** Sequential workflows waste hours. Always design for maximum parallelism. + +### Cross-Workflow Parallelism: Wave Planning + +When a project has multiple workflows, group independent ones into parallel waves: + +```bash +# BAD — sequential (14 hours for 27 workflows at ~30 min each) +agent-relay run workflows/34-sst-wiring.ts +agent-relay run workflows/35-env-config.ts +agent-relay run workflows/36-loading-states.ts +# ... 
one at a time + +# GOOD — parallel waves (3-4 hours for 27 workflows) +# Wave 1: independent infra (parallel) +agent-relay run workflows/34-sst-wiring.ts & +agent-relay run workflows/35-env-config.ts & +agent-relay run workflows/36-loading-states.ts & +agent-relay run workflows/37-responsive.ts & +wait +git add -A && git commit -m "Wave 1" + +# Wave 2: testing (parallel — independent test suites) +agent-relay run workflows/40-unit-tests.ts & +agent-relay run workflows/41-integration-tests.ts & +agent-relay run workflows/42-e2e-tests.ts & +wait +git add -A && git commit -m "Wave 2" +``` + +### Wave Planning Heuristics + +Two workflows can run in parallel if they don't have write-write or write-read file conflicts: + +| Touch Zone | Can Parallelize? | +|---|---| +| Different `packages/*/src/` dirs | ✅ Yes | +| Different `app/` routes | ✅ Yes | +| Same package, different subdirs | ⚠️ Usually yes | +| Same files (shared config, root package.json) | ❌ No — sequential or same wave with merge | +| Explicit dependency | ❌ No — ordered waves | + +### Declare File Scope for Planning + +Help wave planners (human or automated) understand what each workflow touches: + +```typescript +workflow('48-comparison-mode') + .packages(['web', 'core']) // monorepo packages touched + .isolatedFrom(['49-feedback-system']) // explicitly safe to parallelize + .requiresBefore(['46-admin-dashboard']) // explicit ordering constraint +``` + +### Within-Workflow Parallelism + +Use shared `dependsOn` to fan out independent sub-tasks: + +```typescript +// BAD — unnecessary sequential chain +.step('fix-component-a', { agent: 'worker', dependsOn: ['review'] }) +.step('fix-component-b', { agent: 'worker', dependsOn: ['fix-component-a'] }) // why wait? 
+ +// GOOD — parallel fan-out, merge at the end +.step('fix-component-a', { agent: 'impl-1', dependsOn: ['review'] }) +.step('fix-component-b', { agent: 'impl-2', dependsOn: ['review'] }) // same dep = parallel +.step('verify-all', { agent: 'reviewer', dependsOn: ['fix-component-a', 'fix-component-b'] }) +``` + +### Impact + +Real-world example (Relayed — 60 workflows): +- **Sequential**: ~30 min × 60 = **30 hours** +- **Parallel waves (4-6 per wave)**: ~12 waves × 35 min = **~7 hours** (4x faster) +- **Aggressive parallelism (8-way)**: **~4 hours** (7.5x faster) + +--- +## Failure Prevention + +These workflow files are easy to break in ways that only appear mid-run. Follow these rules when authoring or editing workflow `.ts` files. + +### 1. Do not use raw top-level `await` + +Executor-driven workflow files may be run through a `tsx`/`esbuild` path that behaves like CJS. Raw top-level `await` can fail with: + +- `Top-level await is currently not supported with the "cjs" output format` + +Always wrap execution like this: + +```ts +async function runWorkflow() { + const result = await workflow('my-workflow') + // ... + .run({ cwd: process.cwd() }); + + console.log('Workflow status:', result.status); +} + +runWorkflow().catch((error) => { + console.error(error); + process.exit(1); +}); +``` + +Do not end workflow files with bare top-level `await workflow(...).run(...)`. + +### 2. Avoid raw fenced code blocks inside workflow task template literals + +Raw triple-backtick code fences inside large inline `task: \`...\`` template strings are fragile and can break outer TypeScript parsing, especially when they contain language tags like `swift` or `diff`. + +Preferred options, in order: + +1. Avoid inline fenced examples entirely +2. Move larger examples to referenced files +3. Use plain indented examples instead of fenced blocks +4. 
If fenced blocks must exist inside generated inner code, escape them consistently and syntax-check the outer workflow file afterward + +### 3. Keep final verification boring and deterministic + +Final verification should validate real outputs with simple, portable shell commands. If checking for multiple symbols, use extended regex explicitly: + +```bash +grep -Eq "foo|bar|baz" file.ts +``` + +Do **not** rely on basic `grep` alternation like: + +```bash +grep -c "foo\|bar\|baz" file.ts +``` + +That can silently misbehave and create fake failures even when the generated code is correct. + +### 4. Separate durable outputs from execution exhaust + +Commit: + +- generated product code +- migrations +- tests +- docs +- workflow-definition fixes + +Do not commit by default: + +- `.logs/` +- transient executor output +- retry artifacts +- temporary step-output files + +### 5. Prefer Codex for implementation-heavy roles and Claude for review + +Default team split for workflow-authored agent roles: + +- **lead / implementer / writer / fixer** → `codex` +- **reviewer** → `claude` + +Use Claude as the primary implementer only when there is a specific reason. + +### 6. Be explicit about shell requirements + +If executor scripts use Bash-only features such as associative arrays, require modern Bash explicitly. On macOS, prefer a known-good Bash path when needed, for example: + +```bash +/opt/homebrew/bin/bash workflows/your-workflow/execute.sh --wave 2 +``` + +### 7. Make resume semantics explicit + +Document clearly whether the executor supports: + +- full-run continuation +- `--wave` +- `--workflow` +- `--resume` + +Do not assume users will infer the behavior. In particular, `--wave N` should be understood as "run only this wave" unless the executor explicitly chains onward. + +### 8. Syntax-check workflow files after editing + +After editing workflow `.ts` files, run a lightweight syntax check before launching a large batch run. 
This is especially important if the workflow contains: + +- large inline `task` template literals +- embedded code examples +- escaped backticks +- wrapper changes around workflow execution + +--- + +## End-to-End Bug Fix Workflows + +For bug-fix or reliability workflows, do **not** stop at unit or integration tests. The workflow should explicitly prove that the original user-visible problem is fixed. + +### Required phases for fix workflows + +1. **Capture the original failure** + - Reproduce the bug first in a deterministic or evidence-capturing step + - Save exact commands, logs, status codes, or screenshots/artifacts +2. **State the acceptance contract** + - Define the exact end-to-end success criteria before implementation + - Include the real entrypoint a user would run +3. **Implement the fix** +4. **Rebuild / reinstall from scratch** + - Do not trust dirty local state + - Prefer a clean environment when install/bootstrap behavior is involved +5. **Run targeted regression checks** + - Unit/integration tests are helpful but not sufficient by themselves +6. **Run a full end-to-end validation** + - Use the real CLI / API / install path + - Prefer a clean environment (Docker, sandbox, cloud workspace, Daytona, etc.) for install/runtime issues +7. **Compare before vs after evidence** + - Show that the original failure no longer occurs +8. **Record residual risks** + - Call out what was not covered + +### Clean-environment validation guidance + +When the bug involves install, bootstrap, PATH/shims, auth, brokers, background services, OS-specific packaging, or first-run UX, add a second workflow (or second phase) that validates the fix in a **fresh environment**. + +Preferred order of proving environments: +1. disposable sandbox / cloud workspace +2. Docker / containerized environment +3. 
fresh local shell with isolated paths + +### Meta-workflow guidance + +If the right proving environment is unclear, first write a **meta-workflow** that: +- compares candidate validation environments +- defines the acceptance contract +- chooses the best swarm pattern +- then authors the final fix/validation workflow + +This is often better than jumping straight to implementation. + +## Key Concepts + +### Step Output Chaining + +Use `{{steps.STEP_NAME.output}}` in a downstream step's task to inject the prior step's terminal output. + +**Only chain output from clean sources:** +- Deterministic steps (shell commands — always clean) +- Non-interactive agents (`preset: 'worker'` — clean stdout) + +**Never chain from interactive agents** (`cli: 'claude'` without preset) — PTY output includes spinners, ANSI codes, and TUI chrome. Instead, have the agent write to a file, then read it in a deterministic step. + +### Verification Gates + +```typescript +verification: { type: 'exit_code' } // preferred for code-editing steps +verification: { type: 'output_contains', value: 'DONE' } // optional accelerator +verification: { type: 'file_exists', value: 'src/out.ts' } // deterministic file check +``` + +Only these four types are valid: `exit_code`, `output_contains`, `file_exists`, `custom`. Invalid types are silently ignored and fall through to process-exit auto-pass. + +**Verification token gotcha:** If the token appears in the task text, the runner requires it **twice** in output (once from task echo, once from agent). Prefer `exit_code` for code-editing steps to avoid this. + +### DAG Dependencies + +Steps with `dependsOn` wait for all listed steps. Steps with no dependencies start immediately. Steps sharing the same `dependsOn` run in parallel: + +```typescript +.step('fix-types', { agent: 'worker', dependsOn: ['review'], ... }) +.step('fix-tests', { agent: 'worker', dependsOn: ['review'], ... }) +.step('final', { agent: 'lead', dependsOn: ['fix-types', 'fix-tests'], ... 
}) +``` + +### Self-Termination + +Do NOT add exit instructions to task strings. The runner handles this automatically. + +### Step Completion Model + +Steps complete through a multi-signal pipeline (highest priority first): + +1. **Deterministic verification** — `exit_code`, `file_exists`, `output_contains` pass → immediate completion +2. **Owner decision** — `OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL` +3. **Evidence-based** — channel signals, file artifacts, clean exit code +4. **Marker fast-path** — `STEP_COMPLETE:` (optional accelerator) +5. **Process-exit fallback** — agent exits 0 with no signals → completes after grace period + +**Key principle:** No single signal is mandatory. Describe the deliverable, not what to print. + +### Dynamic Channel Management + +Agents can dynamically subscribe, unsubscribe, mute, and unmute channels **after spawn**. This eliminates the need for client-side channel filtering and manual peer fanout. + +#### SDK API + +```typescript +// Subscribe an agent to additional channels post-spawn +relay.subscribe({ agent: 'security-auditor', channels: ['review-pr-456'] }); + +// Unsubscribe — agent leaves the channel entirely +relay.unsubscribe({ agent: 'security-auditor', channels: ['general'] }); + +// Mute — agent stays subscribed (history access) but messages are NOT injected into PTY +relay.mute({ agent: 'security-auditor', channel: 'review-pr-123' }); + +// Unmute — resume PTY injection +relay.unmute({ agent: 'security-auditor', channel: 'review-pr-123' }); +``` + +Agent-level methods are also available: + +```typescript +const agent = await relay.claude.spawn({ name: 'auditor', channels: ['ch-a'] }); +await agent.subscribe(['ch-b']); // now subscribed to ch-a and ch-b +await agent.mute('ch-a'); // ch-a messages silenced (still in history) +await agent.unmute('ch-a'); // ch-a messages resume +await agent.unsubscribe(['ch-b']); // leaves ch-b +console.log(agent.channels); // ['ch-a'] +console.log(agent.mutedChannels); 
// [] +``` + +#### Semantics + +| Operation | Channel membership | PTY injection | History access | +|---------------|-------------------|---------------|----------------| +| `subscribe` | Yes | Yes | Yes | +| `unsubscribe` | No | No | No (leaves) | +| `mute` | Yes (stays) | No (silenced) | Yes (can query)| +| `unmute` | Yes | Yes (resumes) | Yes | + +#### Events + +```typescript +relay.onChannelSubscribed = (agent, channels) => { /* ... */ }; +relay.onChannelUnsubscribed = (agent, channels) => { /* ... */ }; +relay.onChannelMuted = (agent, channel) => { /* ... */ }; +relay.onChannelUnmuted = (agent, channel) => { /* ... */ }; +``` + +#### When to Use in Workflows + +- **Multi-PR chat sessions**: Agents focused on one PR can mute other PR channels to reduce noise +- **Phase transitions**: Subscribe agents to new channels as work progresses between phases +- **Team isolation**: Workers mute the main coordination channel during focused work, unmute for review +- **Dynamic fanout**: A lead subscribes workers to sub-channels at runtime based on task decomposition + +#### What This Eliminates + +With broker-managed subscriptions, you no longer need: +1. Client-side persona filtering (`personaNames.has(from)` checks) +2. Channel prefix regex for message routing +3. Manual peer fanout (iterating agents to forward messages) +4. 
Dedup caches for dual-path delivery + +## Agent Definition + +```typescript +.agent('name', { + cli: 'claude' | 'codex' | 'gemini' | 'aider' | 'goose' | 'opencode' | 'droid', + role?: string, + preset?: 'lead' | 'worker' | 'reviewer' | 'analyst', + retries?: number, + model?: string, + interactive?: boolean, // default: true +}) +``` + +### Model Constants + +**Always use model constants from `@agent-relay/config` instead of string literals.** Each CLI has a typed constants object with its available models: + +```typescript +import { ClaudeModels, CodexModels, GeminiModels } from '@agent-relay/config'; + +.agent('planner', { cli: 'claude', model: ClaudeModels.OPUS }) // not 'opus' +.agent('worker', { cli: 'claude', model: ClaudeModels.SONNET }) // not 'sonnet' +.agent('coder', { cli: 'codex', model: CodexModels.GPT_5_4 }) // not 'gpt-5.4' +``` + +**Post-spawn channel operations** (available on Agent instances and AgentRelay facade): + +```typescript +// Agent instance methods +agent.subscribe(channels: string[]): Promise +agent.unsubscribe(channels: string[]): Promise +agent.mute(channel: string): Promise +agent.unmute(channel: string): Promise +agent.channels: string[] // current subscribed channels +agent.mutedChannels: string[] // currently muted channels + +// AgentRelay facade methods (by agent name) +relay.subscribe({ agent: string, channels: string[] }): Promise +relay.unsubscribe({ agent: string, channels: string[] }): Promise +relay.mute({ agent: string, channel: string }): Promise +relay.unmute({ agent: string, channel: string }): Promise +``` + +| Preset | Interactive | Relay access | Use for | +| ---------- | ------------- | ------------ | ---------------------------------------------------- | +| `lead` | yes (PTY) | yes | Coordination, monitoring channels | +| `worker` | no (subprocess) | no | Bounded tasks, structured stdout | +| `reviewer` | no (subprocess) | no | Reading artifacts, producing verdicts | +| `analyst` | no (subprocess) | no | Reading 
code/files, writing findings | + +Non-interactive presets run via one-shot mode (`claude -p`, `codex exec`). Output is clean and available via `{{steps.X.output}}`. + +**Critical rule:** Pre-inject content into non-interactive agents. Don't ask them to read large files — pre-read in a deterministic step and inject via `{{steps.X.output}}`. + +## Step Definition + +### Agent Steps + +```typescript +.step('name', { + agent: string, + task: string, // supports {{var}} and {{steps.NAME.output}} + dependsOn?: string[], + verification?: VerificationCheck, + retries?: number, +}) +``` + +### Deterministic Steps (Shell Commands) + +```typescript +.step('verify-files', { + type: 'deterministic', + command: 'test -f src/auth.ts && echo "FILE_EXISTS"', + dependsOn: ['implement'], + captureOutput: true, + failOnError: true, +}) +``` + +Use for: file checks, reading files for injection, build/test gates, git operations. + +## Common Patterns + +### Interactive Team (lead + workers on shared channel) + +When a task involves creating/modifying multiple files with review feedback, use **interactive agents on a shared channel** instead of non-interactive one-shot workers. The lead coordinates, reviews, and posts feedback; workers implement and iterate. + +```typescript +.agent('lead', { + cli: 'claude', + model: ClaudeModels.OPUS, + role: 'Architect and reviewer — assigns work, reviews, posts feedback', + retries: 1, + // No preset — interactive by default +}) + +.agent('impl-new', { + cli: 'codex', + model: CodexModels.O3, + role: 'Creates new files. Listens on channel for assignments and feedback.', + retries: 2, + // No preset — interactive, receives channel messages +}) + +.agent('impl-modify', { + cli: 'codex', + model: CodexModels.O3, + role: 'Edits existing files. 
Listens on channel for assignments and feedback.', + retries: 2, +}) + +// All three share the same dependsOn — they start concurrently (no deadlock) +.step('lead-coordinate', { + agent: 'lead', + dependsOn: ['context'], + task: `You are the lead on #channel. Workers: impl-new, impl-modify. +Post the plan. Assign files. Review their work. Post feedback if needed. +Workers iterate based on your feedback. Exit when all files are correct.`, +}) +.step('impl-new-work', { + agent: 'impl-new', + dependsOn: ['context'], // same dep as lead = parallel start + task: `You are impl-new on #channel. Wait for the lead's plan. +Create files as assigned. Report completion. Fix issues from feedback.`, +}) +.step('impl-modify-work', { + agent: 'impl-modify', + dependsOn: ['context'], // same dep as lead = parallel start + task: `You are impl-modify on #channel. Wait for the lead's plan. +Edit files as assigned. Report completion. Fix issues from feedback.`, +}) +// Downstream gates on lead (lead exits when satisfied) +.step('verify', { type: 'deterministic', dependsOn: ['lead-coordinate'], ... }) +``` + +**Key behaviors observed in production:** + +- **Workers self-organize from channel context.** Workers read each other's completion messages and start dependent work without waiting for the lead to relay. The shared channel gives them ambient awareness. +- **Lead-as-reviewer is more efficient than a separate reviewer agent.** The lead reads actual files and runs typecheck between rounds — one agent doing coordination + review eliminates a step. +- **Codex interactive mode works well with PTY channel injection.** Don't default to `preset: 'worker'` — interactive Codex agents receive and act on channel messages reliably. +- **Workers may outpace the lead.** If the lead is reviewing while workers are fast, the lead's "proceed" message may arrive after the worker already started from channel context. This is harmless but worth knowing. 
+- **No feedback loop needed = fast path.** If workers get it right first try, the interactive pattern completes just as fast as one-shot. The feedback loop is insurance, not overhead. + +**When to use interactive team vs one-shot DAG:** + +| Scenario | Pattern | +|----------|---------| +| 4+ files, likely needs iteration | Interactive team | +| Simple edits, well-specified | One-shot DAG with `preset: 'worker'` | +| Cross-agent review feedback loop | Interactive team | +| Independent tasks, no coordination | Fan-out with non-interactive workers | + +### Pipeline (sequential handoff) + +```typescript +.pattern('pipeline') +.step('analyze', { agent: 'analyst', task: '...' }) +.step('implement', { agent: 'dev', task: '{{steps.analyze.output}}', dependsOn: ['analyze'] }) +.step('test', { agent: 'tester', task: '{{steps.implement.output}}', dependsOn: ['implement'] }) +``` + +### Error Handling + +```typescript +.onError('fail-fast') // stop on first failure (default) +.onError('continue') // skip failed branches, continue others +.onError('retry', { maxRetries: 3, retryDelayMs: 5000 }) +``` + +## Multi-File Edit Pattern + +When a workflow needs to modify multiple existing files, **use one agent step per file** with a deterministic verify gate after each. Agents reliably edit 1-2 files per step but fail on 4+. + +```yaml +steps: + - name: read-types + type: deterministic + command: cat src/types.ts + captureOutput: true + + - name: edit-types + agent: dev + dependsOn: [read-types] + task: | + Edit src/types.ts. Current contents: + {{steps.read-types.output}} + Add 'pending' to the Status union type. + Only edit this one file. 
+ verification: + type: exit_code + + - name: verify-types + type: deterministic + dependsOn: [edit-types] + command: 'if git diff --quiet src/types.ts; then echo "NOT MODIFIED"; exit 1; fi; echo "OK"' + failOnError: true + + - name: read-service + type: deterministic + dependsOn: [verify-types] + command: cat src/service.ts + captureOutput: true + + - name: edit-service + agent: dev + dependsOn: [read-service] + task: | + Edit src/service.ts. Current contents: + {{steps.read-service.output}} + Add a handlePending() method. + Only edit this one file. + verification: + type: exit_code + + - name: verify-service + type: deterministic + dependsOn: [edit-service] + command: 'if git diff --quiet src/service.ts; then echo "NOT MODIFIED"; exit 1; fi; echo "OK"' + failOnError: true + + # Deterministic commit — never rely on agents to commit + - name: commit + type: deterministic + dependsOn: [verify-service] + command: git add src/types.ts src/service.ts && git commit -m "feat: add pending status" + failOnError: true +``` + +**Key rules:** +- Read the file in a deterministic step right before the edit (not all files upfront) +- Tell the agent "Only edit this one file" to prevent it touching other files +- Verify with `git diff --quiet` after each edit — fail fast if the agent didn't write +- Always commit with a deterministic step, never an agent step + +## File Materialization: Verify Before Proceeding + +After any step that creates files, add a deterministic `file_exists` check before proceeding. Non-interactive agents may exit 0 without writing anything (wrong cwd, stdout instead of disk). + +```yaml +- name: verify-files + type: deterministic + dependsOn: [impl-auth, impl-storage] + command: | + missing=0 + for f in src/auth/credentials.ts src/storage/client.ts; do + if [ ! 
-f "$f" ]; then echo "MISSING: $f"; missing=$((missing+1)); fi + done + if [ $missing -gt 0 ]; then echo "$missing files missing"; exit 1; fi + echo "All files present" + failOnError: true +``` + +**Rules for file-writing tasks:** +1. Use full paths from project root — say `src/auth/credentials.ts`, not `credentials.ts` +2. Add `IMPORTANT: Write the file to disk. Do NOT output to stdout.` +3. Use `file_exists` verification for creation steps (not just `exit_code`) +4. Gate all downstream steps on the verify step + +## DAG Deadlock Anti-Pattern + +```yaml +# WRONG — deadlock: coordinate depends on context, work-a depends on coordinate +steps: + - name: coordinate + dependsOn: [context] # lead waits for WORKER_DONE... + - name: work-a + dependsOn: [coordinate] # ...but work-a can't start until coordinate finishes + +# RIGHT — workers and lead start in parallel +steps: + - name: context + type: deterministic + - name: work-a + dependsOn: [context] # starts with lead + - name: coordinate + dependsOn: [context] # starts with workers + - name: merge + dependsOn: [work-a, coordinate] +``` + +**Rule:** if a lead step's task mentions downstream step names alongside waiting keywords, that's a deadlock. + +## Step Sizing + +**One agent, one deliverable.** A step's task prompt should be 10-20 lines max. + +Split into a **lead + workers team** when: +- The task requires a 50+ line prompt +- The deliverable is multiple files that must be consistent +- You need one agent to verify another's output + +```yaml +# Team pattern: lead + workers on a shared channel +steps: + - name: track-lead-coord + agent: track-lead + dependsOn: [prior-step] + task: | + Lead the track on #my-track. Workers: track-worker-1, track-worker-2. + Post assignments to the channel. Review worker output. + + - name: track-worker-1-impl + agent: track-worker-1 + dependsOn: [prior-step] # same dep as lead — starts concurrently + task: | + Join #my-track. track-lead will post your assignment. 
+ Implement the file as directed. + verification: + type: exit_code + + - name: next-step + dependsOn: [track-lead-coord] # downstream depends on lead, not workers +``` + +## Supervisor Pattern + +When you set `.pattern('supervisor')` (or `hub-spoke`, `fan-out`), the runner auto-assigns a supervisor agent as owner for worker steps. The supervisor monitors progress, nudges idle workers, and issues `OWNER_DECISION`. + +**Auto-hardening only activates for hub patterns** — not `pipeline` or `dag`. + +| Use case | Pattern | Why | +|----------|---------|-----| +| Sequential, no monitoring | `pipeline` | Simple, no overhead | +| Workers need oversight | `supervisor` | Auto-owner monitors | +| Local/small models | `supervisor` | Supervisor catches stuck workers | +| All non-interactive | `pipeline` or `dag` | No PTY = no supervision needed | + +## Concurrency + +**Cap `maxConcurrency` at 4-6.** Spawning 10+ agents simultaneously causes broker timeouts. + +| Parallel agents | `maxConcurrency` | +|-----------------|-------------------| +| 2-4 | 4 (default safe) | +| 5-10 | 5 | +| 10+ | 6-8 max | + +## Common Mistakes + +| Mistake | Fix | +|---------|-----| +| All workflows run sequentially | Group independent workflows into parallel waves (4-7x speedup) | +| Every step depends on the previous one | Only add `dependsOn` when there's a real data dependency | +| Self-review step with no timeout | Set `timeout: 300_000` (5 min) — Codex hangs in non-interactive review | +| One giant workflow per feature | Split into smaller workflows that can run in parallel waves | +| Adding exit instructions to tasks | Runner handles self-termination automatically | +| Setting `timeoutMs` on agents/steps | Use global `.timeout()` only | +| Using `general` channel | Set `.channel('wf-name')` for isolation | +| `{{steps.X.output}}` without `dependsOn: ['X']` | Output won't be available yet | +| Requiring exact sentinel as only completion gate | Use `exit_code` or `file_exists` verification | +| 
Writing 100-line task prompts | Split into lead + workers on a channel | +| `maxConcurrency: 16` with many parallel steps | Cap at 5-6 | +| Non-interactive agent reading large files via tools | Pre-read in deterministic step, inject via `{{steps.X.output}}` | +| Workers depending on lead step (deadlock) | Both depend on shared context step | +| `fan-out`/`hub-spoke` for simple parallel workers | Use `dag` instead | +| `pipeline` but expecting auto-supervisor | Only hub patterns auto-harden. Use `.pattern('supervisor')` | +| Workers without `preset: 'worker'` in one-shot DAG lead+worker flows | Add preset for clean stdout when chaining `{{steps.X.output}}` (not needed for interactive team patterns) | +| Using `_` in YAML numbers (`timeoutMs: 1_200_000`) | YAML doesn't support `_` separators | +| Workflow timeout under 30 min for complex workflows | Use `3600000` (1 hour) as default | +| Using `require()` in ESM projects | Check `package.json` for `"type": "module"` — use `import` if ESM | +| Wrapping in `async function main()` in ESM | ESM supports top-level `await` — no wrapper needed | +| Using `createWorkflowRenderer` | Does not exist. Use `.run({ cwd: process.cwd() })` | +| `export default workflow(...)...build()` | No `.build()`. Chain ends with `.run()` — the file must call `.run()`, not just export config | +| Relative import `'../workflows/builder.js'` | Use `import { workflow } from '@agent-relay/sdk/workflows'` | +| Hardcoded model strings (`model: 'opus'`) | Use constants: `import { ClaudeModels } from '@agent-relay/config'` → `model: ClaudeModels.OPUS` | +| Thinking `agent-relay run` inspects exports | It executes the file as a subprocess. Only `.run()` invocations trigger steps | +| `pattern('single')` on cloud runner | Not supported — use `dag` | +| `pattern('supervisor')` with one agent | Same agent is owner + specialist. 
Use `dag` | +| Invalid verification type (`type: 'deterministic'`) | Only `exit_code`, `output_contains`, `file_exists`, `custom` are valid | +| Chaining `{{steps.X.output}}` from interactive agents | PTY output is garbled. Use deterministic steps or `preset: 'worker'` | +| Single step editing 4+ files | Agents modify 1-2 then exit. Split to one file per step with verify gates | +| Relying on agents to `git commit` | Agents emit markers without running git. Use deterministic commit step | +| File-writing steps without `file_exists` verification | `exit_code` auto-passes even if no file written | +| Manual peer fanout in `handleChannelMessage()` | Use broker-managed channel subscriptions — broker fans out to all subscribers automatically | +| Client-side `personaNames.has(from)` filtering | Use `relay.subscribe()`/`relay.unsubscribe()` — only subscribed agents receive messages | +| Agents receiving noisy cross-channel messages during focused work | Use `relay.mute({ agent, channel })` to silence non-primary channels without leaving them | +| Hardcoding all channels at spawn time | Use `agent.subscribe()` / `agent.unsubscribe()` for dynamic channel membership post-spawn | +| Using `preset: 'worker'` for Codex in *interactive team* patterns when coordination is needed | Codex interactive mode works fine with PTY channel injection. 
Drop the preset for interactive team patterns (keep it for one-shot DAG workers where clean stdout matters) | +| Separate reviewer agent from lead in interactive team | Merge lead + reviewer into one interactive Claude agent — reviews between rounds, fewer agents | +| Not printing PR URL after `gh pr create` | Add a final deterministic step: `echo "PR: $(cat pr-url.txt)"` or capture in the `gh pr create` command | +| Workflow ending without worktree + PR for cross-repo changes | Add `setup-worktree` at start and `push-and-pr` + `cleanup-worktree` at end | + +## YAML Alternative + +```yaml +version: '1.0' +name: my-workflow +swarm: + pattern: dag + channel: wf-my-workflow +agents: + - name: lead + cli: claude + role: Architect + - name: worker + cli: codex + role: Implementer +workflows: + - name: default + steps: + - name: plan + agent: lead + task: 'Produce a detailed implementation plan.' + - name: implement + agent: worker + task: 'Implement: {{steps.plan.output}}' + dependsOn: [plan] + verification: + type: exit_code +``` + +Run with: `agent-relay run path/to/workflow.yaml` + +## Available Swarm Patterns + +`dag` (default), `fan-out`, `pipeline`, `hub-spoke`, `consensus`, `mesh`, `handoff`, `cascade`, `debate`, `hierarchical`, `map-reduce`, `scatter-gather`, `supervisor`, `reflection`, `red-team`, `verifier`, `auction`, `escalation`, `saga`, `circuit-breaker`, `blackboard`, `swarm` + +See skill `choosing-swarm-patterns` for pattern selection guidance. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5bd1294..dc37c38 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,6 +65,12 @@ jobs: - name: Install dependencies run: npm ci + # A subset of tests spawn the built trail CLI via child_process and + # require dist/cli/index.js to exist on disk (e.g. the compactWorkflow + # SDK helper + autoCompact E2E cases). Build before running tests. 
+ - name: Build + run: npm run build + - name: Run tests run: npm run test:run diff --git a/.gitignore b/.gitignore index ed3930e..88d27f1 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ npm-debug.log* # Trajectories - don't commit active work .trajectories/active/ +.agent-relay/ diff --git a/.msd/autofix-findings-summary.txt b/.msd/autofix-findings-summary.txt new file mode 100644 index 0000000..f1acdaa --- /dev/null +++ b/.msd/autofix-findings-summary.txt @@ -0,0 +1,18 @@ +1. [HIGH] src/cli/commands/compact.ts — src/cli/commands/compact.ts +2. [HIGH] src/cli/commands/compact.ts — src/cli/commands/compact.ts +3. [MEDIUM] src/cli/commands/compact.ts — src/cli/commands/compact.ts +4. [MEDIUM] src/cli/commands/compact.ts — src/cli/commands/compact.ts +5. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +6. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +7. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +8. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +9. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +10. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +11. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +12. [MEDIUM] src/compact/provider.ts — src/compact/provider.ts +13. [MEDIUM] workflows/llm-compaction.ts — workflows/llm-compaction.ts +14. [MEDIUM] src/compact/parser.ts — src/compact/parser.ts +15. [MEDIUM] src/compact/config.ts — src/compact/config.ts +16. [MEDIUM] package.json — package.json +17. [LOW] src/compact/provider.ts — src/compact/provider.ts +18. 
[LOW] tests/compact/llm-compact.test.ts — tests/compact/llm-compact.test.ts diff --git a/.msd/autofix-plan.json b/.msd/autofix-plan.json new file mode 100644 index 0000000..e8f5451 --- /dev/null +++ b/.msd/autofix-plan.json @@ -0,0 +1,57 @@ +{ + "groups": [ + { + "id": "group-1", + "label": "compact.ts command fixes — shell injection, env mutation, type dedup, jsonMode", + "domain": "security", + "findings": [ + "src/cli/commands/compact.ts-Shell injection in getBranchCommits (line 403)-security-review, developer-review, historian-review-high", + "src/cli/commands/compact.ts-Global process.env mutation in storage loop (lines 313-314)-developer-review-high", + "src/cli/commands/compact.ts-Duplicate/conflicting CompactedTrajectory types (lines 52-77)-developer-review-medium", + "src/cli/commands/compact.ts-jsonMode inconsistency across providers (line 234)-historian-review-medium" + ], + "files": ["src/cli/commands/compact.ts"], + "rationale": "All 4 findings in the same file; includes both HIGH severity issues (shell injection, env mutation)" + }, + { + "id": "group-2", + "label": "compact provider fixes — SSRF, API keys, timeouts, env passthrough, error leaks, types", + "domain": "security", + "findings": [ + "src/compact/provider.ts-SSRF via configurable base URLs (lines 72, 128)-security-review-medium", + "src/compact/provider.ts-Empty/whitespace API key handling (lines 66, 121)-developer-review-medium", + "src/compact/provider.ts-Anthropic fallback prompt fabrication (lines 152-168)-developer-review-medium", + "src/compact/provider.ts-Missing fetch timeouts (lines 83-96)-historian-review-medium", + "src/compact/provider.ts-Hardcoded Anthropic API version (line 152)-historian-review-medium", + "src/compact/provider.ts-Duplicate Message interface (lines 7-10)-developer-review-medium", + "src/compact/provider.ts-CLI arg length limits (lines 269-273)-historian-review-medium", + "src/compact/provider.ts-Full env passthrough to CLI subprocesses (lines 
229-233)-security-review-medium", + "src/compact/provider.ts-Error message data leak in parseJson (line 348)-security-review-low" + ], + "files": ["src/compact/provider.ts"], + "rationale": "All 9 findings are in src/compact/provider.ts — cannot split across workers due to file-conflict rule" + }, + { + "id": "group-3", + "label": "supporting files — parser, config, workflow, package.json, tests", + "domain": "code-quality", + "findings": [ + "workflows/llm-compaction.ts-Hardcoded absolute path (line 26)-historian-review, security-review, developer-review-medium", + "src/compact/parser.ts-Incomplete escape sequence handling in extractBalancedJsonObject (lines 91-134)-developer-review, historian-review-medium", + "src/compact/config.ts-Implicit config merge precedence (lines 61-68)-developer-review, historian-review-medium", + "package.json-@agent-relay/sdk as regular dependency-historian-review-medium", + "tests/compact/llm-compact.test.ts-No mocked LLM provider integration test (lines 152-201)-developer-review-low" + ], + "files": [ + "workflows/llm-compaction.ts", + "src/compact/parser.ts", + "src/compact/config.ts", + "package.json", + "tests/compact/llm-compact.test.ts" + ], + "rationale": "Remaining files grouped together; all in compact domain but distinct files from groups 1-2" + } + ], + "totalGroups": 3, + "conflictCheck": "no file appears in multiple groups" +} diff --git a/AGENTS.md b/AGENTS.md index f847f19..48c5242 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -140,3 +140,156 @@ QUESTION: JWT or sessions?>>> - Escape with `\->relay:` to output literally - Check daemon status: `agent-relay status` + + +# Trail + +Record your work as a trajectory for future agents and humans to follow. 
+ +## Usage + +If `trail` is installed globally, run commands directly: +```bash +trail start "Task description" +``` + +If not globally installed, use npx to run from local installation: +```bash +npx trail start "Task description" +``` + +## When Starting Work + +Start a trajectory when beginning a task: + +```bash +trail start "Implement user authentication" +``` + +With external task reference: +```bash +trail start "Fix login bug" --task "ENG-123" +``` + +## Recording Decisions + +Record key decisions as you work: + +```bash +trail decision "Chose JWT over sessions" \ + --reasoning "Stateless scaling requirements" +``` + +For minor decisions, reasoning is optional: +```bash +trail decision "Used existing auth middleware" +``` + +**Record decisions when you:** +- Choose between alternatives +- Make architectural trade-offs +- Decide on an approach after investigation + +## Recording Reflections + +Periodically step back and synthesize progress: + +```bash +trail reflect "Workers aligned on auth approach, API layer progressing well" \ + --confidence 0.8 +``` + +With focal points and adjustments: +```bash +trail reflect "Frontend and backend duplicating validation logic" \ + --focal-points "duplication,ownership" \ + --adjustments "Reassigning validation to backend team" \ + --confidence 0.7 +``` + +**Record reflections when you:** +- Have received several updates and need to synthesize the big picture +- Notice workers or tasks diverging from the plan +- Want to course-correct before continuing +- Are coordinating multiple agents and need to assess overall progress + +Reflections differ from decisions: decisions record a specific choice, +reflections record a higher-level synthesis of what's happening and whether +the current approach is working. 
+
+## Completing Work
+
+When done, complete with a retrospective:
+
+```bash
+trail complete --summary "Added JWT auth with refresh tokens" --confidence 0.85
+```
+
+**Confidence levels:**
+- 0.9+ : High confidence, well-tested
+- 0.7-0.9 : Good confidence, standard implementation
+- 0.5-0.7 : Some uncertainty, edge cases possible
+- <0.5 : Significant uncertainty, needs review
+
+## Abandoning Work
+
+If you need to stop without completing:
+
+```bash
+trail abandon --reason "Blocked by missing API credentials"
+```
+
+## Checking Status
+
+View current trajectory:
+```bash
+trail status
+```
+
+## Listing and Viewing Trajectories
+
+List all trajectories:
+```bash
+trail list
+```
+
+View a specific trajectory:
+```bash
+trail show <id>
+```
+
+Export a trajectory (markdown, json, timeline, html, pr-summary):
+```bash
+trail export <id> --format markdown
+```
+
+## Compacting Trajectories
+
+After a PR merge, compact related trajectories into a single summary:
+
+```bash
+trail compact --pr 42
+```
+
+Compact by branch:
+```bash
+trail compact --branch feature/auth
+```
+
+Compact by commit range:
+```bash
+trail compact --commits abc123..def456
+```
+
+Compaction consolidates decisions and creates a grouped summary, reducing noise while preserving key decisions.
+
+## Why Trail?
+
+Your trajectory helps others understand:
+- **What** you built (commits show this)
+- **Why** you built it this way (trajectory shows this)
+- **What alternatives** you considered
+- **What challenges** you faced
+
+Future agents can query past trajectories to learn from your decisions.
+ diff --git a/package-lock.json b/package-lock.json index 5b18b43..2a1269b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -220,6 +220,7 @@ }, "node_modules/@clack/prompts/node_modules/is-unicode-supported": { "version": "1.3.0", + "extraneous": true, "inBundle": true, "license": "MIT", "engines": { @@ -1375,14 +1376,14 @@ } }, "node_modules/cli-truncate": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-5.1.1.tgz", - "integrity": "sha512-SroPvNHxUnk+vIW/dOSfNqdy1sPEFkrTk6TUtqLCnBlo3N7TNYYkzzN7uSD6+jVjrdO4+p8nH7JzH6cIvUem6A==", + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-5.2.0.tgz", + "integrity": "sha512-xRwvIOMGrfOAnM1JYtqQImuaNtDEv9v6oIYAs4LIHwTiKee8uwvIi363igssOC0O5U04i4AlENs79LQLu9tEMw==", "dev": true, "license": "MIT", "dependencies": { - "slice-ansi": "^7.1.0", - "string-width": "^8.0.0" + "slice-ansi": "^8.0.0", + "string-width": "^8.2.0" }, "engines": { "node": ">=20" @@ -1391,6 +1392,23 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/cli-truncate/node_modules/slice-ansi": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-8.0.0.tgz", + "integrity": "sha512-stxByr12oeeOyY2BlviTNQlYV5xOj47GirPr4yA1hE9JCtxfQN0+tVbkxwCtYDQWhEKWFHsEK48ORg5jrouCAg==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.2.3", + "is-fullwidth-code-point": "^5.1.0" + }, + "engines": { + "node": ">=20" + }, + "funding": { + "url": "https://github.com/chalk/slice-ansi?sponsor=1" + } + }, "node_modules/colorette": { "version": "2.0.20", "resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz", diff --git a/prpm.lock b/prpm.lock index 9136f4b..0eb03e8 100644 --- a/prpm.lock +++ b/prpm.lock @@ -112,7 +112,146 @@ "sourceFormat": "claude", "sourceSubtype": "skill", "installedPath": ".claude/skills/creating-skills-skill/SKILL.md" + }, + "@agent-relay/choosing-swarm-patterns#claude": 
{ + "version": "1.0.0", + "resolved": "https://registry.prpm.dev/api/v1/packages/%40agent-relay%2Fchoosing-swarm-patterns/1.0.0.tar.gz", + "integrity": "sha256-2b28661abb540c56b46ad980b238589c6dcf59faaa3e66c80c72f72c01407f38", + "format": "claude", + "subtype": "skill", + "sourceFormat": "claude", + "sourceSubtype": "skill", + "installedPath": ".claude/skills/choosing-swarm-patterns/SKILL.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + } + }, + "@agent-relay/writing-agent-relay-workflows#claude": { + "version": "1.4.0", + "resolved": "https://registry.prpm.dev/api/v1/packages/%40agent-relay%2Fwriting-agent-relay-workflows/1.4.0.tar.gz", + "integrity": "sha256-9bbb0861f470beedde502569c7e8d665808a82bc3ca40029d864eb7f0420419d", + "format": "claude", + "subtype": "skill", + "sourceFormat": "claude", + "sourceSubtype": "skill", + "installedPath": ".claude/skills/writing-agent-relay-workflows/SKILL.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + } + }, + "@agent-workforce/trail-snippet#claude:AGENTS.md": { + "version": "1.1.0", + "resolved": "https://registry.prpm.dev/api/v1/packages/%40agent-workforce%2Ftrail-snippet/1.1.0.tar.gz", + "integrity": "sha256-8d8824b5236660e18c780b3574ee756737401d9a77e843411cdd509ceb9e0668", + "format": "claude", + "subtype": "snippet", + "sourceFormat": "generic", + "sourceSubtype": "snippet", + "installedPath": "AGENTS.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + }, + "snippetMetadata": { + "targetPath": "AGENTS.md", + "config": { + "target": "AGENTS.md", + "position": "append" + } + } + }, + "@agent-relay/relay-80-100-workflow#claude": { + "version": "1.0.0", + "resolved": "https://registry.prpm.dev/api/v1/packages/%40agent-relay%2Frelay-80-100-workflow/1.0.0.tar.gz", + "integrity": "sha256-b0e70fc43fa009f6cd9d8ae965f6e318b6b4724fb34c6378233ef3a4bd09cfb9", + "format": "claude", + "subtype": "skill", + "sourceFormat": 
"claude", + "sourceSubtype": "skill", + "installedPath": ".claude/skills/relay-80-100-workflow/SKILL.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + } + }, + "@agent-relay/choosing-swarm-patterns#codex": { + "version": "1.0.0", + "resolved": "https://registry.prpm.dev/api/v1/packages/%40agent-relay%2Fchoosing-swarm-patterns/1.0.0.tar.gz", + "integrity": "sha256-2b28661abb540c56b46ad980b238589c6dcf59faaa3e66c80c72f72c01407f38", + "format": "codex", + "subtype": "skill", + "sourceFormat": "claude", + "sourceSubtype": "skill", + "installedPath": ".agents/skills/choosing-swarm-patterns/SKILL.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + } + }, + "@agent-relay/writing-agent-relay-workflows#codex": { + "version": "1.4.0", + "resolved": "https://registry.prpm.dev/api/v1/packages/%40agent-relay%2Fwriting-agent-relay-workflows/1.4.0.tar.gz", + "integrity": "sha256-9bbb0861f470beedde502569c7e8d665808a82bc3ca40029d864eb7f0420419d", + "format": "codex", + "subtype": "skill", + "sourceFormat": "claude", + "sourceSubtype": "skill", + "installedPath": ".agents/skills/writing-agent-relay-workflows/SKILL.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + } + }, + "@agent-workforce/trail-snippet#codex:AGENTS.md": { + "version": "1.1.0", + "resolved": "https://registry.prpm.dev/api/v1/packages/%40agent-workforce%2Ftrail-snippet/1.1.0.tar.gz", + "integrity": "sha256-8d8824b5236660e18c780b3574ee756737401d9a77e843411cdd509ceb9e0668", + "format": "codex", + "subtype": "snippet", + "sourceFormat": "generic", + "sourceSubtype": "snippet", + "installedPath": "AGENTS.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + }, + "snippetMetadata": { + "targetPath": "AGENTS.md", + "config": { + "target": "AGENTS.md", + "position": "append" + } + } + }, + "@agent-relay/relay-80-100-workflow#codex": { + "version": "1.0.0", + "resolved": 
"https://registry.prpm.dev/api/v1/packages/%40agent-relay%2Frelay-80-100-workflow/1.0.0.tar.gz", + "integrity": "sha256-b0e70fc43fa009f6cd9d8ae965f6e318b6b4724fb34c6378233ef3a4bd09cfb9", + "format": "codex", + "subtype": "skill", + "sourceFormat": "claude", + "sourceSubtype": "skill", + "installedPath": ".agents/skills/relay-80-100-workflow/SKILL.md", + "fromCollection": { + "name_slug": "agent-relay-starter", + "version": "1.0.3" + } } }, - "generated": "2026-02-20T16:18:25.877Z" + "generated": "2026-04-12T07:15:45.434Z", + "collections": { + "agent-relay-starter": { + "name_slug": "agent-relay-starter", + "version": "1.0.3", + "installedAt": "2026-04-12T07:15:45.434Z", + "packages": [ + "@agent-relay/choosing-swarm-patterns", + "@agent-relay/writing-agent-relay-workflows", + "@agent-workforce/trail-snippet", + "@agent-relay/relay-80-100-workflow" + ] + } + } } \ No newline at end of file diff --git a/scripts/autocompact-probe.mts b/scripts/autocompact-probe.mts new file mode 100644 index 0000000..190e457 --- /dev/null +++ b/scripts/autocompact-probe.mts @@ -0,0 +1,48 @@ +/** + * autocompact-probe.mts + * + * Shared probe script used by workflows/sdk-autocompact-option.ts for + * BEFORE/AFTER validation of the TrajectoryClient.autoCompact option. + * + * Usage: + * cd + * TRAJECTORIES_WORKFLOW_ID= PROBE_AUTOCOMPACT=true \ + * npx tsx /abs/path/to/scripts/autocompact-probe.mts + * + * Env vars: + * TRAJECTORIES_WORKFLOW_ID — stamped onto the trajectory so + * `trail compact --workflow ` can select it. + * PROBE_AUTOCOMPACT — "true" turns on autoCompact: { mechanical: true, + * markdown: true }. Any other value leaves autoCompact unset, which + * is the BEFORE baseline behavior. + * + * The script uses `as never` casts on the autoCompact option so it is + * safe to run even when the option doesn't exist on the Trajectory type + * yet (pre-implementation). That lets the same file serve both BEFORE + * (autoCompact ignored) and AFTER (autoCompact honored) runs. 
+ */
+
+import { TrajectoryClient } from "../src/sdk/index.js";
+
+async function main() {
+  const options: Record<string, unknown> = { defaultAgent: "probe" };
+  if (process.env.PROBE_AUTOCOMPACT === "true") {
+    options.autoCompact = { mechanical: true, markdown: true };
+  }
+
+  const client = new TrajectoryClient(options as never);
+  await client.init();
+
+  const session = await client.start("autocompact probe");
+  await session.decide("Probe decision one", "chosen A", "because reasons");
+  await session.decide("Probe decision two", "chosen B", "more reasons");
+  await session.done("autocompact probe complete", 0.95);
+  await client.close();
+
+  console.log(`PROBE_OK id=${session.id}`);
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/scripts/benchmark-compaction.ts b/scripts/benchmark-compaction.ts
new file mode 100644
index 0000000..fe35299
--- /dev/null
+++ b/scripts/benchmark-compaction.ts
@@ -0,0 +1,142 @@
+#!/usr/bin/env tsx
+/**
+ * Reproducible benchmark fixture for the LLM compaction feature.
+ *
+ * Records a deliberately noisy trajectory (3 chapters, 5+ decisions, 10+
+ * findings, 15+ low-significance noise events) using the default
+ * TrajectoryClient storage (always `<cwd>/.trajectories/`). The caller is
+ * expected to `cd` into an isolated directory before invoking this script.
+ *
+ * Reads TRAJECTORIES_WORKFLOW_ID from the environment and forwards it to
+ * the trajectory via createTrajectory's `workflowId` option if accepted by
+ * the SDK; otherwise the SDK picks it up from the env var directly. On
+ * pre-feature runs this is a no-op, which is expected.
+ * + * Prints exactly one line to stdout: `TRAJECTORY_ID=` + */ + +import { TrajectoryClient } from "../src/sdk/index.js"; + +async function main(): Promise { + const workflowId = process.env.TRAJECTORIES_WORKFLOW_ID; + const client = new TrajectoryClient({ defaultAgent: "benchmark-agent" }); + await client.init(); + + // Forward the workflow id via start() options if the SDK accepts it. + // On pre-feature runs the field is unknown to CreateTrajectoryInput, so + // we widen the cast — the SDK will simply ignore unrecognised keys, and + // once the feature lands the env var path also works. + const startOpts = workflowId + ? ({ workflowId } as unknown as Parameters[1]) + : undefined; + + const session = await client.start( + "Benchmark: noisy compaction sample", + startOpts, + ); + + // ---------- Chapter 1: Investigation ---------- + await session.chapter("Investigation", "benchmark-agent"); + await session.finding("Repository has 47 TypeScript modules under src/"); + await session.finding("Compaction target: events with significance=low"); + await session.finding("Existing exporters: markdown, json, timeline, pr"); + await session.event("tool_call", "rg --files src/", { significance: "low" }); + await session.event("tool_result", "47 files matched", { + significance: "low", + }); + await session.event("thinking", "Considering which paths to scan first", { + significance: "low", + }); + await session.event("tool_call", "cat src/sdk/client.ts", { + significance: "low", + }); + await session.event("tool_result", "file 580 lines", { significance: "low" }); + await session.event("thinking", "Client looks stable; no API churn needed", { + significance: "low", + }); + await session.decide( + "Where to anchor the benchmark?", + "scripts/benchmark-compaction.ts", + "Co-locates with future fixtures and stays out of src/", + [{ option: "tests/fixtures/", reason: "Mixes fixtures with unit tests" }], + ); + await session.decide( + "How to pass workflow id?", + "Read 
TRAJECTORIES_WORKFLOW_ID and forward through start options", + "Matches the env-var contract spec", + ); + + // ---------- Chapter 2: Implementation ---------- + await session.chapter("Implementation", "benchmark-agent"); + await session.finding( + "TrajectoryClient defaults storage to /.trajectories", + ); + await session.finding("Session API is fully chainable and auto-saves"); + await session.finding("addEvent accepts arbitrary significance levels"); + await session.event("tool_call", "rg TrajectoryClient src/sdk", { + significance: "low", + }); + await session.event("tool_result", "12 hits", { significance: "low" }); + await session.event("thinking", "Will reuse default storage path", { + significance: "low", + }); + await session.event("tool_call", "ls .trajectories", { significance: "low" }); + await session.event("tool_result", "active/ completed/", { + significance: "low", + }); + await session.event("thinking", "Storage already initialised", { + significance: "low", + }); + await session.event("tool_call", "node --version", { significance: "low" }); + await session.event("tool_result", "v20.11.0", { significance: "low" }); + await session.decide( + "Auto-save vs manual save?", + "Auto-save (default)", + "Removes a class of fixture flake", + ); + await session.decide( + "Noise volume?", + "15+ low-significance events", + "Gives the compactor enough signal to demonstrate reduction", + ); + + // ---------- Chapter 3: Validation ---------- + await session.chapter("Validation", "benchmark-agent"); + await session.finding("Trajectory has 3 chapters as required"); + await session.finding("Decision count >= 5"); + await session.finding("Finding count >= 10"); + await session.finding("Low-significance event count >= 15"); + await session.event("tool_call", "wc -l scripts/benchmark-compaction.ts", { + significance: "low", + }); + await session.event("tool_result", "under 120 lines", { + significance: "low", + }); + await session.event("thinking", "All quotas met", { 
significance: "low" }); + await session.decide( + "Print format?", + "Single TRAJECTORY_ID= line on stdout", + "Easiest for the harness to parse", + ); + + await session.complete({ + summary: + "Recorded a deliberately noisy 3-chapter sample trajectory for compaction benchmarks.", + confidence: 0.95, + approach: + "Use TrajectoryClient defaults; emit 5+ decisions, 10+ findings, and 15+ noise events across three chapters.", + challenges: ["Balancing noise volume vs script size budget"], + learnings: ["TrajectorySession's chainable API keeps fixture code compact"], + suggestions: [ + "Reuse this script as the canonical before/after compaction input", + ], + }); + + process.stdout.write(`TRAJECTORY_ID=${session.id}\n`); + await client.close(); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/src/cli/commands/compact.ts b/src/cli/commands/compact.ts index cf5fe82..3ec5f82 100644 --- a/src/cli/commands/compact.ts +++ b/src/cli/commands/compact.ts @@ -8,10 +8,28 @@ * Default behavior: compact only trajectories that haven't been compacted yet. 
*/ -import { execSync } from "node:child_process"; +import { execFileSync } from "node:child_process"; import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; -import { join } from "node:path"; +import { dirname, join } from "node:path"; import type { Command } from "commander"; +import { getCompactionConfig } from "../../compact/config.js"; +import { generateCompactionMarkdown } from "../../compact/markdown.js"; +import { + type CompactedTrajectoryMetadata, + type LLMCompactedOutput, + mergeCompactionWithMetadata, + parseCompactionResponse, +} from "../../compact/parser.js"; +import { buildCompactionPrompt } from "../../compact/prompts.js"; +import { + AnthropicProvider, + CLIProvider, + type CompactionLLM, + type Message, + OpenAIProvider, + resolveProvider, +} from "../../compact/provider.js"; +import { serializeForLLM } from "../../compact/serializer.js"; import { generateRandomId } from "../../core/id.js"; import type { Decision, Trajectory } from "../../core/types.js"; import { FileStorage, getSearchPaths } from "../../storage/file.js"; @@ -30,28 +48,19 @@ interface DecisionGroup { } /** - * Compacted trajectory summary + * Compacted trajectory summary — extends the shared metadata type from parser.ts + * with mechanical compaction fields and optional LLM output. 
*/ -interface CompactedTrajectory { - id: string; - version: 1; - type: "compacted"; - compactedAt: string; - sourceTrajectories: string[]; - dateRange: { - start: string; - end: string; - }; - summary: { - totalDecisions: number; - totalEvents: number; - uniqueAgents: string[]; - }; +interface CompactedTrajectory extends CompactedTrajectoryMetadata { decisionGroups: DecisionGroup[]; keyLearnings: string[]; keyFindings: string[]; - filesAffected: string[]; - commits: string[]; + workflowId?: string; + narrative?: string; + decisions?: LLMCompactedOutput["decisions"]; + conventions?: LLMCompactedOutput["conventions"]; + lessons?: LLMCompactedOutput["lessons"]; + openQuestions?: string[]; } /** @@ -66,6 +75,30 @@ interface IndexEntry { compactedInto?: string; } +interface CompactCommandOptions { + since?: string; + until?: string; + ids?: string; + workflow?: string; + pr?: string; + branch?: string; + commits?: string; + all?: boolean; + llm?: boolean; + mechanical?: boolean; + focus?: string; + markdown?: boolean; + dryRun?: boolean; + output?: string; +} + +interface LLMCompactionPlan { + messages: Message[]; + estimatedInputTokens: number; + estimatedOutputTokens: number; + focusAreas: string[]; +} + export function registerCompactCommand(program: Command): void { program .command("compact") @@ -81,6 +114,10 @@ export function registerCompactCommand(program: Command): void { "Include trajectories until this date (ISO format)", ) .option("--ids ", "Comma-separated list of trajectory IDs to compact") + .option( + "--workflow ", + "Compact trajectories with the specified workflow ID", + ) .option("--pr ", "Compact trajectories associated with a PR number") .option( "--branch ", @@ -91,9 +128,18 @@ export function registerCompactCommand(program: Command): void { "Comma-separated commit SHAs to match trajectories against", ) .option("--all", "Include all trajectories, even previously compacted ones") + .option("--llm", "Use LLM-based compaction when a provider is 
available") + .option("--no-llm", "Disable LLM-based compaction") + .option("--mechanical", "Force the original mechanical compaction flow") + .option( + "--focus ", + "Comma-separated focus areas to emphasize in LLM compaction", + ) + .option("--markdown", "Also write a Markdown companion file") + .option("--no-markdown", "Skip writing a Markdown companion file") .option("--dry-run", "Preview what would be compacted without saving") .option("--output ", "Output path for compacted trajectory") - .action(async (options) => { + .action(async (options: CompactCommandOptions) => { const trajectories = await loadTrajectories(options); if (trajectories.length === 0) { @@ -101,6 +147,7 @@ export function registerCompactCommand(program: Command): void { options.all || options.since || options.ids || + options.workflow || options.pr || options.branch || options.commits @@ -116,22 +163,103 @@ export function registerCompactCommand(program: Command): void { console.log(`Compacting ${trajectories.length} trajectories...\n`); - const compacted = compactTrajectories(trajectories); + const config = getCompactionConfig(); + const provider = await resolveProvider(config); + const useLLM = shouldUseLLM(options, provider !== null); + const markdownEnabled = options.markdown !== false; + const mechanicalCompacted = compactTrajectories( + trajectories, + options.workflow, + ); + + if (!useLLM || provider === null) { + if (options.llm && provider === null && !options.mechanical) { + console.log( + "No LLM provider detected; falling back to mechanical compaction.\n", + ); + } + + if (options.dryRun) { + console.log("=== DRY RUN - Preview ===\n"); + printCompactedSummary(mechanicalCompacted); + return; + } + + const outputPath = + options.output || + getDefaultOutputPath(mechanicalCompacted, options.workflow); + saveCompactionArtifacts( + mechanicalCompacted, + outputPath, + markdownEnabled, + ); + await markTrajectoriesAsCompacted(trajectories, mechanicalCompacted.id); + + 
console.log(`\nCompacted trajectory saved to: ${outputPath}`); + if (markdownEnabled) { + console.log( + `Markdown summary saved to: ${getMarkdownOutputPath(outputPath)}`, + ); + } + printCompactedSummary(mechanicalCompacted); + return; + } + + const llmPlan = buildLLMCompactionPlan( + trajectories, + parseFocusAreas(options.focus), + config.maxInputTokens, + config.maxOutputTokens, + ); + + console.log( + `Using ${getProviderLabel(provider)} compaction${config.model ? ` with model ${config.model}` : ""}.`, + ); + console.log( + `Estimated: ~${llmPlan.estimatedInputTokens} input tokens, ~${llmPlan.estimatedOutputTokens} output tokens`, + ); if (options.dryRun) { - console.log("=== DRY RUN - Preview ===\n"); - printCompactedSummary(compacted); + printLLMDryRun(llmPlan, config.model, options.workflow); return; } - // Save the compacted trajectory - const outputPath = options.output || getDefaultOutputPath(compacted); - saveCompactedTrajectory(compacted, outputPath); + const llmOutput = await provider.complete(llmPlan.messages, { + maxTokens: config.maxOutputTokens, + temperature: config.temperature, + jsonMode: provider instanceof OpenAIProvider, + }); + const llmCompacted = parseCompactionResponse(llmOutput); + const mergedCompaction = mergeCompactionWithMetadata( + { + id: mechanicalCompacted.id, + version: mechanicalCompacted.version, + type: mechanicalCompacted.type, + compactedAt: mechanicalCompacted.compactedAt, + sourceTrajectories: mechanicalCompacted.sourceTrajectories, + dateRange: mechanicalCompacted.dateRange, + summary: mechanicalCompacted.summary, + filesAffected: mechanicalCompacted.filesAffected, + commits: mechanicalCompacted.commits, + }, + llmCompacted, + ); + const compacted: CompactedTrajectory = { + ...mechanicalCompacted, + ...mergedCompaction, + }; - // Mark source trajectories as compacted + const outputPath = + options.output || getDefaultOutputPath(compacted, options.workflow); + saveCompactionArtifacts(compacted, outputPath, 
markdownEnabled); await markTrajectoriesAsCompacted(trajectories, compacted.id); console.log(`\nCompacted trajectory saved to: ${outputPath}`); + if (markdownEnabled) { + console.log( + `Markdown summary saved to: ${getMarkdownOutputPath(outputPath)}`, + ); + } printCompactedSummary(compacted); }); } @@ -140,6 +268,7 @@ async function loadTrajectories(options: { since?: string; until?: string; ids?: string; + workflow?: string; pr?: string; branch?: string; commits?: string; @@ -181,80 +310,84 @@ async function loadTrajectories(options: { for (const searchPath of searchPaths) { if (!existsSync(searchPath)) continue; + // Set env var only for the synchronous FileStorage constructor, then + // immediately restore to avoid leaking state across async boundaries. const originalDataDir = process.env.TRAJECTORIES_DATA_DIR; process.env.TRAJECTORIES_DATA_DIR = searchPath; + const storage = new FileStorage(); + if (originalDataDir !== undefined) { + process.env.TRAJECTORIES_DATA_DIR = originalDataDir; + } else { + // biome-ignore lint/performance/noDelete: process.env requires delete to truly unset (assignment stores string "undefined") + delete process.env.TRAJECTORIES_DATA_DIR; + } - try { - const storage = new FileStorage(); - await storage.initialize(); + await storage.initialize(); - const summaries = await storage.list({ - status: "completed", - limit: Number.MAX_SAFE_INTEGER, - }); + const summaries = await storage.list({ + status: "completed", + limit: Number.MAX_SAFE_INTEGER, + }); + + for (const summary of summaries) { + if (seenIds.has(summary.id)) continue; + + // Skip already compacted (unless --all) + if (compactedIds.has(summary.id)) continue; + + // Filter by IDs if specified + if (targetIds && !targetIds.includes(summary.id)) continue; + + // Filter by date range + const startDate = new Date(summary.startedAt); + if (sinceDate && startDate < sinceDate) continue; + if (untilDate && startDate > untilDate) continue; - for (const summary of summaries) { - if 
(seenIds.has(summary.id)) continue; - - // Skip already compacted (unless --all) - if (compactedIds.has(summary.id)) continue; - - // Filter by IDs if specified - if (targetIds && !targetIds.includes(summary.id)) continue; - - // Filter by date range - const startDate = new Date(summary.startedAt); - if (sinceDate && startDate < sinceDate) continue; - if (untilDate && startDate > untilDate) continue; - - // Load full trajectory - const trajectory = await storage.get(summary.id); - if (trajectory) { - seenIds.add(summary.id); - - // Filter by PR if specified - if (options.pr) { - const escaped = options.pr.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); - // Match "#N" or "PR #N" / "PR N" patterns, requiring word boundaries - // to avoid false matches on words containing "pr" (e.g., "Improve") - const prPattern = new RegExp( - `#${escaped}\\b|\\bPR\\s*#?\\s*${escaped}\\b`, - "i", - ); - const matchesPR = - prPattern.test(trajectory.task.title) || - prPattern.test(trajectory.task.description || "") || - trajectory.commits.some((c) => prPattern.test(c)); - - if (!matchesPR) continue; - } - - // Filter by branch if specified - if (branchCommits) { - const hasMatchingCommit = trajectory.commits.some( - (c) => branchCommits.has(c.slice(0, 7)) || branchCommits.has(c), - ); - if (!hasMatchingCommit && trajectory.commits.length > 0) continue; - // Include trajectories with no commits (they might still be relevant) - } - - // Filter by commits if specified - if (targetCommits) { - const hasMatchingCommit = trajectory.commits.some( - (c) => targetCommits.has(c) || targetCommits.has(c.slice(0, 7)), - ); - if (!hasMatchingCommit) continue; - } - - trajectories.push(trajectory); + // Load full trajectory + const trajectory = await storage.get(summary.id); + if (trajectory) { + seenIds.add(summary.id); + + // Filter by workflow if specified + if (options.workflow && trajectory.workflowId !== options.workflow) { + continue; } - } - } finally { - if (originalDataDir !== undefined) { - 
process.env.TRAJECTORIES_DATA_DIR = originalDataDir; - } else { - // biome-ignore lint/performance/noDelete: process.env requires delete to truly unset (assignment stores string "undefined") - delete process.env.TRAJECTORIES_DATA_DIR; + + // Filter by PR if specified + if (options.pr) { + const escaped = options.pr.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + // Match "#N" or "PR #N" / "PR N" patterns, requiring word boundaries + // to avoid false matches on words containing "pr" (e.g., "Improve") + const prPattern = new RegExp( + `#${escaped}\\b|\\bPR\\s*#?\\s*${escaped}\\b`, + "i", + ); + const matchesPR = + prPattern.test(trajectory.task.title) || + prPattern.test(trajectory.task.description || "") || + trajectory.commits.some((c) => prPattern.test(c)); + + if (!matchesPR) continue; + } + + // Filter by branch if specified + if (branchCommits) { + const hasMatchingCommit = trajectory.commits.some( + (c) => branchCommits.has(c.slice(0, 7)) || branchCommits.has(c), + ); + if (!hasMatchingCommit && trajectory.commits.length > 0) continue; + // Include trajectories with no commits (they might still be relevant) + } + + // Filter by commits if specified + if (targetCommits) { + const hasMatchingCommit = trajectory.commits.some( + (c) => targetCommits.has(c) || targetCommits.has(c.slice(0, 7)), + ); + if (!hasMatchingCommit) continue; + } + + trajectories.push(trajectory); } } } @@ -270,8 +403,9 @@ function getBranchCommits(targetBranch: string): Set { try { // Get commits on HEAD that are not in target branch - const output = execSync( - `git log '${targetBranch.replace(/'/g, "'\\''")}'..HEAD --format=%H`, + const output = execFileSync( + "git", + ["log", `${targetBranch}..HEAD`, "--format=%H"], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], @@ -387,7 +521,10 @@ function parseRelativeDate(input: string): Date { return new Date(input); } -function compactTrajectories(trajectories: Trajectory[]): CompactedTrajectory { +function compactTrajectories( + trajectories: 
Trajectory[], + workflowId?: string, +): CompactedTrajectory { const allDecisions: Array<{ decision: Decision; fromTrajectory: string; @@ -474,6 +611,7 @@ function compactTrajectories(trajectories: Trajectory[]): CompactedTrajectory { version: 1, type: "compacted", compactedAt: new Date().toISOString(), + workflowId, sourceTrajectories: trajectories.map((t) => t.id), dateRange: { start: minDate.toISOString(), @@ -560,7 +698,109 @@ function groupDecisions( ); } -function getDefaultOutputPath(compacted: CompactedTrajectory): string { +function shouldUseLLM( + options: Pick, + providerAvailable: boolean, +): boolean { + if (options.mechanical) { + return false; + } + + if (options.llm === false) { + return false; + } + + if (options.llm === true) { + return providerAvailable; + } + + return providerAvailable; +} + +function buildLLMCompactionPlan( + trajectories: Trajectory[], + focusAreas: string[], + maxInputTokens: number, + maxOutputTokens: number, +): LLMCompactionPlan { + const serialized = serializeForLLM(trajectories, maxInputTokens); + const messages = buildCompactionPrompt(serialized, { + focusAreas, + maxOutputTokens, + }); + + return { + messages, + estimatedInputTokens: estimateTokens( + messages.map((message) => message.content).join("\n\n"), + ), + estimatedOutputTokens: maxOutputTokens, + focusAreas, + }; +} + +function parseFocusAreas(focus?: string): string[] { + if (!focus) { + return []; + } + + return focus + .split(",") + .map((area) => area.trim()) + .filter(Boolean); +} + +function estimateTokens(text: string): number { + return Math.max(1, Math.ceil(text.length / 4)); +} + +function printLLMDryRun( + plan: LLMCompactionPlan, + model: string | undefined, + workflowId?: string, +): void { + console.log("=== DRY RUN - LLM Prompt Preview ===\n"); + console.log( + `Estimated: ~${plan.estimatedInputTokens} input tokens, ~${plan.estimatedOutputTokens} output tokens`, + ); + if (model) { + console.log(`Configured model: ${model}`); + } + if 
(workflowId) { + console.log(`Workflow: ${workflowId}`); + } + if (plan.focusAreas.length > 0) { + console.log(`Focus: ${plan.focusAreas.join(", ")}`); + } + console.log(""); + + for (const message of plan.messages) { + console.log(`[${message.role.toUpperCase()}]`); + console.log(message.content); + console.log(""); + } +} + +function getProviderLabel(provider: CompactionLLM): string { + if (provider instanceof OpenAIProvider) { + return "OpenAI"; + } + + if (provider instanceof AnthropicProvider) { + return "Anthropic"; + } + + if (provider instanceof CLIProvider) { + return `CLI (${provider.cliName})`; + } + + return "LLM"; +} + +function getDefaultOutputPath( + compacted: CompactedTrajectory, + workflowId?: string, +): string { const trajDir = process.env.TRAJECTORIES_DATA_DIR || ".trajectories"; const compactedDir = join(trajDir, "compacted"); @@ -568,25 +808,101 @@ function getDefaultOutputPath(compacted: CompactedTrajectory): string { mkdirSync(compactedDir, { recursive: true }); } + if (workflowId) { + return join(compactedDir, `workflow-${workflowId}.json`); + } + const dateStr = new Date().toISOString().slice(0, 10); return join(compactedDir, `${compacted.id}_${dateStr}.json`); } -function saveCompactedTrajectory( +function saveCompactionArtifacts( compacted: CompactedTrajectory, outputPath: string, + markdownEnabled: boolean, ): void { - const dir = join(outputPath, ".."); + const dir = dirname(outputPath); if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } writeFileSync(outputPath, JSON.stringify(compacted, null, 2)); + + if (markdownEnabled) { + writeFileSync( + getMarkdownOutputPath(outputPath), + renderCompactionMarkdown(compacted), + ); + } +} + +function getMarkdownOutputPath(outputPath: string): string { + return outputPath.endsWith(".json") + ? 
outputPath.slice(0, -".json".length).concat(".md") + : `${outputPath}.md`; +} + +function renderCompactionMarkdown(compacted: CompactedTrajectory): string { + if (compacted.narrative) { + return generateCompactionMarkdown( + compacted as Parameters[0], + ); + } + + const decisionGroups = + compacted.decisionGroups.length > 0 + ? compacted.decisionGroups + .map((group) => { + const decisions = + group.decisions.length > 0 + ? group.decisions + .map( + (decision) => + `- ${decision.question} -> ${decision.chosen} (${decision.fromTrajectory})`, + ) + .join("\n") + : "- None"; + return `## ${capitalize(group.category)}\n${decisions}`; + }) + .join("\n\n") + : "## Decision Groups\n- None"; + const learnings = + compacted.keyLearnings.length > 0 + ? compacted.keyLearnings.map((learning) => `- ${learning}`).join("\n") + : "- None"; + const findings = + compacted.keyFindings.length > 0 + ? compacted.keyFindings.map((finding) => `- ${finding}`).join("\n") + : "- None"; + + return [ + `# Trajectory Compaction: ${formatDate(compacted.dateRange.start)} - ${formatDate(compacted.dateRange.end)}`, + "", + "## Summary", + `- Sessions: ${compacted.sourceTrajectories.length}`, + ...(compacted.workflowId ? 
[`- Workflow: ${compacted.workflowId}`] : []), + `- Decisions: ${compacted.summary.totalDecisions}`, + `- Events: ${compacted.summary.totalEvents}`, + `- Agents: ${compacted.summary.uniqueAgents.join(", ") || "None"}`, + `- Files: ${compacted.filesAffected.length}`, + `- Commits: ${compacted.commits.length}`, + "", + decisionGroups, + "", + "## Key Learnings", + learnings, + "", + "## Key Findings", + findings, + ].join("\n"); } function printCompactedSummary(compacted: CompactedTrajectory): void { console.log("=== Compacted Trajectory Summary ===\n"); console.log(`ID: ${compacted.id}`); + if (compacted.workflowId) { + console.log(`Workflow: ${compacted.workflowId}`); + } console.log(`Source trajectories: ${compacted.sourceTrajectories.length}`); console.log( `Date range: ${formatDate(compacted.dateRange.start)} - ${formatDate(compacted.dateRange.end)}`, @@ -596,30 +912,62 @@ function printCompactedSummary(compacted: CompactedTrajectory): void { console.log(`Agents: ${compacted.summary.uniqueAgents.join(", ")}`); console.log(""); - console.log("=== Decision Groups ===\n"); - for (const group of compacted.decisionGroups) { - console.log( - `${capitalize(group.category)} (${group.decisions.length} decisions):`, - ); - for (const decision of group.decisions.slice(0, 3)) { - console.log(` - ${decision.question}`); - console.log(` Chose: ${decision.chosen}`); - } - if (group.decisions.length > 3) { - console.log(` ... 
and ${group.decisions.length - 3} more`); - } + if (compacted.narrative) { + console.log("=== Narrative ===\n"); + console.log(compacted.narrative); console.log(""); - } - if (compacted.keyLearnings.length > 0) { - console.log("=== Key Learnings ===\n"); - for (const learning of compacted.keyLearnings.slice(0, 5)) { - console.log(` - ${learning}`); + if (compacted.decisions && compacted.decisions.length > 0) { + console.log("=== Key Decisions ===\n"); + for (const decision of compacted.decisions.slice(0, 5)) { + console.log(` - ${decision.question}`); + console.log(` Chosen: ${decision.chosen}`); + if (decision.impact) { + console.log(` Impact: ${decision.impact}`); + } + } + if (compacted.decisions.length > 5) { + console.log(` ... and ${compacted.decisions.length - 5} more`); + } + console.log(""); + } + + if (compacted.openQuestions && compacted.openQuestions.length > 0) { + console.log("=== Open Questions ===\n"); + for (const question of compacted.openQuestions.slice(0, 5)) { + console.log(` - ${question}`); + } + if (compacted.openQuestions.length > 5) { + console.log(` ... and ${compacted.openQuestions.length - 5} more`); + } + console.log(""); } - if (compacted.keyLearnings.length > 5) { - console.log(` ... and ${compacted.keyLearnings.length - 5} more`); + } else { + console.log("=== Decision Groups ===\n"); + for (const group of compacted.decisionGroups) { + console.log( + `${capitalize(group.category)} (${group.decisions.length} decisions):`, + ); + for (const decision of group.decisions.slice(0, 3)) { + console.log(` - ${decision.question}`); + console.log(` Chose: ${decision.chosen}`); + } + if (group.decisions.length > 3) { + console.log(` ... 
and ${group.decisions.length - 3} more`); + } + console.log(""); + } + + if (compacted.keyLearnings.length > 0) { + console.log("=== Key Learnings ===\n"); + for (const learning of compacted.keyLearnings.slice(0, 5)) { + console.log(` - ${learning}`); + } + if (compacted.keyLearnings.length > 5) { + console.log(` ... and ${compacted.keyLearnings.length - 5} more`); + } + console.log(""); } - console.log(""); } if (compacted.filesAffected.length > 0) { diff --git a/src/cli/commands/start.ts b/src/cli/commands/start.ts index ae0fafb..cc938da 100644 --- a/src/cli/commands/start.ts +++ b/src/cli/commands/start.ts @@ -20,6 +20,10 @@ export function registerStartCommand(program: Command): void { .option("--url ", "URL to external task") .option("-a, --agent ", "Agent name (or set TRAJECTORIES_AGENT)") .option("-p, --project ", "Project ID (or set TRAJECTORIES_PROJECT)") + .option( + "-w, --workflow ", + "Workflow run id (or set TRAJECTORIES_WORKFLOW_ID). Stamped onto the trajectory so `trail compact --workflow ` can collate a run.", + ) .option("-q, --quiet", "Only output trajectory ID (for scripting)") .action(async (title: string, options) => { const storage = new FileStorage(); @@ -55,6 +59,15 @@ export function registerStartCommand(program: Command): void { const projectId = options.project ?? process.env.TRAJECTORIES_PROJECT ?? undefined; + // Resolve workflow id from CLI flag or env var. When set, the trajectory + // is stamped so `trail compact --workflow ` can collate an entire + // relay workflow run into one tight artifact. 
+ const workflowId = + (typeof options.workflow === "string" && options.workflow.trim()) || + (typeof process.env.TRAJECTORIES_WORKFLOW_ID === "string" && + process.env.TRAJECTORIES_WORKFLOW_ID.trim()) || + undefined; + // Capture git state for trace tracking const startRef = captureGitState(); @@ -65,6 +78,10 @@ export function registerStartCommand(program: Command): void { projectId, }); + if (workflowId) { + trajectory = { ...trajectory, workflowId }; + } + // Add trace reference if in a git repo if (startRef) { trajectory = { diff --git a/src/compact/config.ts b/src/compact/config.ts new file mode 100644 index 0000000..da22136 --- /dev/null +++ b/src/compact/config.ts @@ -0,0 +1,136 @@ +import { existsSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import { getSearchPaths } from "../storage/file.js"; + +export interface CompactionConfig { + provider: string; + model: string | undefined; + maxInputTokens: number; + maxOutputTokens: number; + temperature: number; +} + +const DEFAULT_CONFIG: CompactionConfig = { + provider: "auto", + model: undefined, + maxInputTokens: 30000, + maxOutputTokens: 4000, + temperature: 0.3, +}; + +export function getCompactionConfig(): CompactionConfig { + const fileConfig = loadFileConfig(); + + return { + provider: + readStringEnv("TRAJECTORIES_LLM_PROVIDER") ?? + readString(fileConfig.provider) ?? + DEFAULT_CONFIG.provider, + model: + readStringEnv("TRAJECTORIES_LLM_MODEL") ?? + readString(fileConfig.model) ?? + DEFAULT_CONFIG.model, + maxInputTokens: + readNumberEnv("TRAJECTORIES_LLM_MAX_INPUT_TOKENS") ?? + readNumber(fileConfig.maxInputTokens) ?? + DEFAULT_CONFIG.maxInputTokens, + maxOutputTokens: + readNumberEnv("TRAJECTORIES_LLM_MAX_OUTPUT_TOKENS") ?? + readNumber(fileConfig.maxOutputTokens) ?? + DEFAULT_CONFIG.maxOutputTokens, + temperature: + readNumberEnv("TRAJECTORIES_LLM_TEMPERATURE") ?? + readNumber(fileConfig.temperature) ?? 
+ DEFAULT_CONFIG.temperature, + }; +} + +function loadFileConfig(): Partial { + const configPath = join(getPrimaryConfigDir(), "config.json"); + if (!existsSync(configPath)) { + return {}; + } + + try { + const raw = JSON.parse(readFileSync(configPath, "utf-8")) as unknown; + if (!isRecord(raw)) { + return {}; + } + + // Merge precedence (last wins): root < compaction < llm + // e.g. { "model": "x", "compaction": { "model": "y" }, "llm": { "model": "z" } } + // results in model = "z" + const merged: Record = {}; + for (const section of [raw, raw.compaction, raw.llm]) { + if (!isRecord(section)) { + continue; + } + + for (const [key, value] of Object.entries(section)) { + if (key === "compaction" || key === "llm") { + continue; + } + merged[key] = value; + } + } + + return { + provider: readString(merged.provider), + model: readString(merged.model), + maxInputTokens: readNumber(merged.maxInputTokens), + maxOutputTokens: readNumber(merged.maxOutputTokens), + temperature: readNumber(merged.temperature), + }; + } catch { + return {}; + } +} + +function getPrimaryConfigDir(): string { + const searchPaths = getSearchPaths(); + return searchPaths[0] ?? join(process.cwd(), ".trajectories"); +} + +function readStringEnv(name: string): string | undefined { + return readString(process.env[name]); +} + +function readNumberEnv(name: string): number | undefined { + return readNumber(process.env[name]); +} + +function readString(value: unknown): string | undefined { + if (typeof value !== "string") { + return undefined; + } + + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function readNumber(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) { + return value; + } + + if (typeof value !== "string") { + return undefined; + } + + // Treat empty / whitespace-only strings as unset, matching readString's + // behavior. 
Otherwise `Number("") === 0` would silently override the + // default maxInputTokens / maxOutputTokens (nullish coalescing does NOT + // fall back for 0), truncating serialized trajectories to an empty + // string or sending max_tokens: 0 to the LLM API. + const trimmed = value.trim(); + if (trimmed.length === 0) { + return undefined; + } + + const parsed = Number(trimmed); + return Number.isFinite(parsed) ? parsed : undefined; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null; +} diff --git a/src/compact/index.ts b/src/compact/index.ts new file mode 100644 index 0000000..a4adaac --- /dev/null +++ b/src/compact/index.ts @@ -0,0 +1,10 @@ +export * from "./config.js"; +export * from "./markdown.js"; +export * from "./parser.js"; +export { + COMPACTED_OUTPUT_SCHEMA, + COMPACTION_SYSTEM_PROMPT, + buildCompactionPrompt, +} from "./prompts.js"; +export * from "./provider.js"; +export * from "./serializer.js"; diff --git a/src/compact/markdown.ts b/src/compact/markdown.ts new file mode 100644 index 0000000..3fadc04 --- /dev/null +++ b/src/compact/markdown.ts @@ -0,0 +1,83 @@ +import type { CompactedTrajectory, LLMCompactedOutput } from "./parser.js"; + +export function generateCompactionMarkdown( + compacted: CompactedTrajectory & LLMCompactedOutput, +): string { + const dateRange = `${formatDate(compacted.dateRange.start)} - ${formatDate(compacted.dateRange.end)}`; + const agents = + compacted.summary.uniqueAgents.length > 0 + ? compacted.summary.uniqueAgents.join(", ") + : "None"; + const decisionRows = + compacted.decisions.length > 0 + ? compacted.decisions + .map( + (decision) => + `| ${escapeTableCell(decision.question)} | ${escapeTableCell(decision.chosen)} | ${escapeTableCell(decision.impact)} |`, + ) + .join("\n") + : "| None identified | | |"; + const conventions = + compacted.conventions.length > 0 + ? 
compacted.conventions + .map( + (convention) => + `- **${convention.pattern || "Unnamed pattern"}**: ${convention.rationale || "No rationale captured."} (scope: ${convention.scope || "unspecified"})`, + ) + .join("\n") + : "- None established."; + const lessons = + compacted.lessons.length > 0 + ? compacted.lessons + .map((lesson) => { + const context = lesson.context ? ` (${lesson.context})` : ""; + const recommendation = lesson.recommendation + ? ` - ${lesson.recommendation}` + : ""; + return `- ${lesson.lesson}${context}${recommendation}`; + }) + .join("\n") + : "- None captured."; + const openQuestions = + compacted.openQuestions.length > 0 + ? compacted.openQuestions.map((question) => `- ${question}`).join("\n") + : "- None."; + + return [ + `# Trajectory Compaction: ${dateRange}`, + "", + "## Summary", + compacted.narrative || "No narrative available.", + "", + `## Key Decisions (${compacted.decisions.length})`, + "| Question | Decision | Impact |", + "|----------|----------|--------|", + decisionRows, + "", + "## Conventions Established", + conventions, + "", + "## Lessons Learned", + lessons, + "", + "## Open Questions", + openQuestions, + "", + "## Stats", + `- Sessions: ${compacted.sourceTrajectories.length}, Agents: ${agents}, Files: ${compacted.filesAffected.length}, Commits: ${compacted.commits.length}`, + `- Date range: ${compacted.dateRange.start} - ${compacted.dateRange.end}`, + ].join("\n"); +} + +function formatDate(value: string): string { + const date = new Date(value); + if (Number.isNaN(date.getTime())) { + return value; + } + + return date.toISOString().slice(0, 10); +} + +function escapeTableCell(value: string): string { + return value.replace(/\|/g, "\\|").replace(/\n/g, " "); +} diff --git a/src/compact/parser.ts b/src/compact/parser.ts new file mode 100644 index 0000000..43d0d32 --- /dev/null +++ b/src/compact/parser.ts @@ -0,0 +1,486 @@ +export interface CompactedDecision { + question: string; + chosen: string; + reasoning: string; + 
impact: string; +} + +export interface CompactedConvention { + pattern: string; + rationale: string; + scope: string; +} + +export interface CompactedLesson { + lesson: string; + context: string; + recommendation: string; +} + +export interface LLMCompactedOutput { + narrative: string; + decisions: CompactedDecision[]; + conventions: CompactedConvention[]; + lessons: CompactedLesson[]; + openQuestions: string[]; +} + +export interface CompactedTrajectoryMetadata { + id: string; + version: number; + type: "compacted"; + compactedAt: string; + sourceTrajectories: string[]; + dateRange: { + start: string; + end: string; + }; + summary: { + totalDecisions: number; + totalEvents: number; + uniqueAgents: string[]; + }; + filesAffected: string[]; + commits: string[]; +} + +export type CompactedTrajectory = CompactedTrajectoryMetadata & + LLMCompactedOutput; + +export function parseCompactionResponse(llmOutput: string): LLMCompactedOutput { + const trimmed = llmOutput.trim(); + const parsedJson = + parseJsonCandidate(trimmed) ?? + parseJsonCandidate(extractFirstMarkdownJsonBlock(trimmed)) ?? + parseJsonCandidate(extractBalancedJsonObject(trimmed)); + + if (parsedJson) { + return normalizeCompactionOutput(parsedJson, trimmed); + } + + return normalizeCompactionOutput(extractFromProse(trimmed), trimmed); +} + +export function mergeCompactionWithMetadata( + metadata: CompactedTrajectoryMetadata, + llmOutput: LLMCompactedOutput, +): CompactedTrajectory { + return { + ...metadata, + ...llmOutput, + }; +} + +function parseJsonCandidate(candidate: string | null): unknown | null { + if (!candidate) { + return null; + } + + try { + return JSON.parse(candidate); + } catch { + return null; + } +} + +function extractFirstMarkdownJsonBlock(text: string): string | null { + const match = text.match(/```(?:json)?\s*([\s\S]*?)```/i); + return match ? 
match[1].trim() : null; +} + +function extractBalancedJsonObject(text: string): string | null { + const start = text.indexOf("{"); + if (start === -1) { + return null; + } + + let depth = 0; + let inString = false; + let escaped = false; + + for (let index = start; index < text.length; index += 1) { + const char = text[index]; + + if (escaped) { + escaped = false; + continue; + } + + if (char === "\\") { + escaped = true; + continue; + } + + if (char === '"') { + inString = !inString; + continue; + } + + if (inString) { + continue; + } + + if (char === "{") { + depth += 1; + } else if (char === "}") { + depth -= 1; + if (depth === 0) { + return text.slice(start, index + 1); + } + } + } + + return null; +} + +function extractFromProse(text: string): Partial { + const sections = splitSections(text); + const narrativeSection = + sections.narrative ?? sections.summary ?? leadingNarrative(text); + + return { + narrative: normalizeText(narrativeSection), + decisions: parseDecisionSection( + sections["key decisions"] ?? sections.decisions ?? "", + ), + conventions: parseConventionSection( + sections["conventions established"] ?? sections.conventions ?? "", + ), + lessons: parseLessonSection( + sections["lessons learned"] ?? sections.lessons ?? "", + ), + openQuestions: parseStringList( + sections["open questions"] ?? sections.questions ?? "", + ), + }; +} + +function splitSections(text: string): Record { + const matches = [...text.matchAll(/^##+\s+(.+?)\s*$/gm)]; + const sections: Record = {}; + + for (let index = 0; index < matches.length; index += 1) { + const current = matches[index]; + const next = matches[index + 1]; + const title = normalizeHeading(current[1]); + const start = + current.index === undefined ? 0 : current.index + current[0].length; + const end = next?.index ?? 
text.length; + sections[title] = text.slice(start, end).trim(); + } + + return sections; +} + +function normalizeHeading(value: string): string { + return value + .toLowerCase() + .replace(/\(\d+\)/g, "") + .replace(/[^a-z0-9\s]/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function leadingNarrative(text: string): string { + const beforeHeading = text.split(/^##+\s+/m, 1)[0] ?? ""; + const withoutCode = beforeHeading.replace(/```[\s\S]*?```/g, "").trim(); + return withoutCode; +} + +function normalizeCompactionOutput( + raw: unknown, + fallbackNarrativeSource: string, +): LLMCompactedOutput { + const candidate = isRecord(raw) ? raw : {}; + + const narrative = normalizeText( + typeof candidate.narrative === "string" + ? candidate.narrative + : typeof candidate.summary === "string" + ? candidate.summary + : typeof candidate.overview === "string" + ? candidate.overview + : leadingNarrative(fallbackNarrativeSource), + ); + + return { + narrative: + narrative || + normalizeText(fallbackNarrativeSource) || + "No narrative provided.", + decisions: normalizeDecisionArray(candidate.decisions), + conventions: normalizeConventionArray(candidate.conventions), + lessons: normalizeLessonArray(candidate.lessons), + openQuestions: normalizeStringArray( + candidate.openQuestions ?? + candidate.open_questions ?? 
+ candidate.questions, + ), + }; +} + +function normalizeDecisionArray(value: unknown): CompactedDecision[] { + if (!Array.isArray(value)) { + return []; + } + + return value + .map((entry) => { + if (typeof entry === "string") { + return { + question: normalizeText(entry), + chosen: "", + reasoning: "", + impact: "", + }; + } + + if (!isRecord(entry)) { + return null; + } + + return { + question: readString(entry, ["question", "prompt", "topic"]), + chosen: readString(entry, ["chosen", "decision", "answer"]), + reasoning: readString(entry, ["reasoning", "why", "rationale"]), + impact: readString(entry, ["impact", "result", "outcome"]), + }; + }) + .filter((entry): entry is CompactedDecision => { + return entry !== null && hasContent(Object.values(entry)); + }); +} + +function normalizeConventionArray(value: unknown): CompactedConvention[] { + if (!Array.isArray(value)) { + return []; + } + + return value + .map((entry) => { + if (typeof entry === "string") { + return { + pattern: normalizeText(entry), + rationale: "", + scope: "", + }; + } + + if (!isRecord(entry)) { + return null; + } + + return { + pattern: readString(entry, ["pattern", "rule", "convention"]), + rationale: readString(entry, ["rationale", "reasoning", "why"]), + scope: readString(entry, ["scope", "appliesTo", "applies_to"]), + }; + }) + .filter((entry): entry is CompactedConvention => { + return entry !== null && hasContent(Object.values(entry)); + }); +} + +function normalizeLessonArray(value: unknown): CompactedLesson[] { + if (!Array.isArray(value)) { + return []; + } + + return value + .map((entry) => { + if (typeof entry === "string") { + return { + lesson: normalizeText(entry), + context: "", + recommendation: "", + }; + } + + if (!isRecord(entry)) { + return null; + } + + return { + lesson: readString(entry, ["lesson", "learning", "takeaway"]), + context: readString(entry, ["context", "situation", "when"]), + recommendation: readString(entry, [ + "recommendation", + "suggestion", + 
"nextStep", + "next_step", + ]), + }; + }) + .filter((entry): entry is CompactedLesson => { + return entry !== null && hasContent(Object.values(entry)); + }); +} + +function normalizeStringArray(value: unknown): string[] { + if (!Array.isArray(value)) { + return []; + } + + return value + .map((entry) => (typeof entry === "string" ? normalizeText(entry) : "")) + .filter(Boolean); +} + +function parseDecisionSection(section: string): CompactedDecision[] { + const tableDecisions = parseMarkdownTable(section).map((row) => ({ + question: row[0] ?? "", + chosen: row[1] ?? "", + reasoning: row[2] ?? "", + impact: row[3] ?? row[2] ?? "", + })); + + if (tableDecisions.length > 0) { + return tableDecisions.filter((entry) => hasContent(Object.values(entry))); + } + + return parseListItems(section) + .map((item) => { + const fields = parseFieldMap(item); + return { + question: + fields.question ?? + fields.prompt ?? + fields.topic ?? + fields.title ?? + item, + chosen: fields.chosen ?? fields.decision ?? fields.answer ?? "", + reasoning: fields.reasoning ?? fields.rationale ?? fields.why ?? "", + impact: fields.impact ?? fields.outcome ?? fields.result ?? "", + }; + }) + .filter((entry) => hasContent(Object.values(entry))); +} + +function parseConventionSection(section: string): CompactedConvention[] { + return parseListItems(section) + .map((item) => { + const emphasized = item.match(/^\*\*(.+?)\*\*:\s*(.+)$/); + const scopeMatch = item.match(/\((?:scope|applies to):\s*([^)]+)\)\s*$/i); + const withoutScope = scopeMatch + ? item.slice(0, scopeMatch.index).trim() + : item; + + if (emphasized) { + return { + pattern: normalizeText(emphasized[1]), + rationale: normalizeText( + withoutScope.replace(/^\*\*(.+?)\*\*:\s*/, ""), + ), + scope: normalizeText(scopeMatch?.[1] ?? ""), + }; + } + + const fields = parseFieldMap(item); + return { + pattern: fields.pattern ?? fields.convention ?? fields.rule ?? item, + rationale: fields.rationale ?? fields.reasoning ?? fields.why ?? 
"", + scope: fields.scope ?? fields.applies ?? "", + }; + }) + .filter((entry) => hasContent(Object.values(entry))); +} + +function parseLessonSection(section: string): CompactedLesson[] { + return parseListItems(section) + .map((item) => { + const fields = parseFieldMap(item); + const dashParts = item.split(/\s[—-]\s/, 2); + + return { + lesson: + fields.lesson ?? + fields.learning ?? + fields.takeaway ?? + dashParts[0] ?? + item, + context: fields.context ?? "", + recommendation: + fields.recommendation ?? + fields.suggestion ?? + fields.nextstep ?? + dashParts[1] ?? + "", + }; + }) + .filter((entry) => hasContent(Object.values(entry))); +} + +function parseStringList(section: string): string[] { + return parseListItems(section).map(normalizeText).filter(Boolean); +} + +function parseMarkdownTable(section: string): string[][] { + const lines = section + .split("\n") + .map((line) => line.trim()) + .filter((line) => line.startsWith("|")); + + if (lines.length < 2) { + return []; + } + + return lines + .slice(1) + .filter((line) => !/^\|?\s*:?-{3,}/.test(line.replace(/\|/g, ""))) + .map((line) => + line + .split("|") + .slice(1, -1) + .map((cell) => normalizeText(cell)), + ); +} + +function parseListItems(section: string): string[] { + return section + .split("\n") + .map((line) => line.trim()) + .filter((line) => /^[-*] |\d+\.\s/.test(line)) + .map((line) => line.replace(/^[-*]\s+|\d+\.\s+/, "").trim()) + .filter(Boolean); +} + +function parseFieldMap(item: string): Record { + const normalized = item.replace(/\s+\|\s+/g, "; "); + const segments = normalized.split(/;\s+/); + const fields: Record = {}; + + for (const segment of segments) { + const match = segment.match(/^([A-Za-z ]+):\s*(.+)$/); + if (!match) { + continue; + } + + const key = match[1].toLowerCase().replace(/\s+/g, ""); + fields[key] = normalizeText(match[2]); + } + + return fields; +} + +function readString(record: Record, keys: string[]): string { + for (const key of keys) { + const value = 
record[key]; + if (typeof value === "string") { + return normalizeText(value); + } + } + + return ""; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null; +} + +function normalizeText(value: string): string { + return value.replace(/\r\n/g, "\n").replace(/\s+/g, " ").trim(); +} + +function hasContent(values: string[]): boolean { + return values.some((value) => value.trim().length > 0); +} diff --git a/src/compact/prompts.ts b/src/compact/prompts.ts new file mode 100644 index 0000000..40d5407 --- /dev/null +++ b/src/compact/prompts.ts @@ -0,0 +1,96 @@ +export interface Message { + role: "system" | "user"; + content: string; +} + +export interface PromptOptions { + focusAreas?: string[]; + maxOutputTokens?: number; +} + +export const COMPACTION_SYSTEM_PROMPT = `You are a technical analyst reviewing agent work sessions (trajectories). +Your job is to produce a concise, insightful summary that captures: +- What was accomplished and how +- Key decisions and their reasoning +- Patterns/conventions established that should be followed in future work +- Lessons learned from challenges and failures +- Open questions or unresolved issues + +Be specific. Reference actual file paths, function names, and technical details. +Don't be generic - this summary replaces the raw data.`; + +export const COMPACTED_OUTPUT_SCHEMA = `{ + "narrative": "string", + "decisions": [ + { + "question": "string", + "chosen": "string", + "reasoning": "string", + "impact": "string" + } + ], + "conventions": [ + { + "pattern": "string", + "rationale": "string", + "scope": "string" + } + ], + "lessons": [ + { + "lesson": "string", + "context": "string", + "recommendation": "string" + } + ], + "openQuestions": ["string"] +}`; + +export function buildCompactionPrompt( + serializedTrajectories: string, + options: PromptOptions = {}, +): Message[] { + const focusAreas = + options.focusAreas && options.focusAreas.length > 0 + ? 
options.focusAreas.map((area) => `- ${area}`).join("\n") + : [ + "- What work was attempted, completed, or abandoned", + "- Why specific technical decisions were made", + "- Which conventions should carry forward", + "- What broke, what worked, and what should change next time", + ].join("\n"); + + const maxOutputInstruction = options.maxOutputTokens + ? `Keep the full response within approximately ${options.maxOutputTokens} tokens while preserving technical specificity.` + : "Keep the response concise, dense with signal, and avoid filler."; + + const userPrompt = [ + "Review the following serialized agent trajectories and return a single JSON object.", + "The JSON must match this schema exactly:", + COMPACTED_OUTPUT_SCHEMA, + "", + "Requirements:", + "- Output raw JSON only. Do not wrap it in markdown fences.", + "- `narrative` should be 2-3 tight paragraphs.", + "- `decisions`, `conventions`, and `lessons` must always be arrays, even if empty.", + "- Prefer concrete file paths, symbols, commands, and implementation details over generic summaries.", + maxOutputInstruction, + "", + "Focus areas:", + focusAreas, + "", + "Serialized trajectories:", + serializedTrajectories.trim(), + ].join("\n"); + + return [ + { + role: "system", + content: COMPACTION_SYSTEM_PROMPT, + }, + { + role: "user", + content: userPrompt, + }, + ]; +} diff --git a/src/compact/provider.ts b/src/compact/provider.ts new file mode 100644 index 0000000..d19df55 --- /dev/null +++ b/src/compact/provider.ts @@ -0,0 +1,472 @@ +import { execFile, spawn } from "node:child_process"; +import { constants, accessSync } from "node:fs"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import { promisify } from "node:util"; +import type { CompactionConfig } from "./config.js"; + +const execFileAsync = promisify(execFile); + +// Note: extends prompts.ts Message with additional "assistant" role for provider responses +export interface Message { + role: "system" | "user" | "assistant"; + 
content: string; +} + +export interface CompletionOptions { + maxTokens?: number; + temperature?: number; + jsonMode?: boolean; +} + +export interface CompactionLLM { + complete(messages: Message[], options?: CompletionOptions): Promise; +} + +interface ProviderConfig { + apiKey: string; + model: string; + baseUrl: string; +} + +interface OpenAIChatResponse { + choices?: Array<{ + message?: { + content?: string | null; + }; + }>; + error?: { + message?: string; + }; +} + +interface AnthropicResponse { + content?: Array< + | { + type: "text"; + text: string; + } + | { + type: string; + } + >; + error?: { + message?: string; + }; +} + +export const DEFAULT_OPENAI_MODEL = "gpt-4o"; +export const DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-20250514"; +const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com"; +const DEFAULT_ANTHROPIC_BASE_URL = "https://api.anthropic.com"; +const DEFAULT_MAX_TOKENS = 4096; + +export class OpenAIProvider implements CompactionLLM { + private readonly apiKey: string; + private readonly model: string; + private readonly baseUrl: string; + + constructor(config: Partial = {}) { + this.apiKey = + config.apiKey?.trim() || process.env.OPENAI_API_KEY?.trim() || ""; + this.model = + normalizeModel(config.model) ?? + normalizeModel(process.env.TRAJECTORIES_LLM_MODEL) ?? + DEFAULT_OPENAI_MODEL; + this.baseUrl = + config.baseUrl ?? process.env.OPENAI_BASE_URL ?? 
DEFAULT_OPENAI_BASE_URL; + + if (this.baseUrl !== DEFAULT_OPENAI_BASE_URL) { + console.warn( + `[trajectories] OpenAI base URL overridden to: ${this.baseUrl}`, + ); + } + + if (!this.apiKey) { + throw new Error("OPENAI_API_KEY is required for OpenAIProvider"); + } + } + + async complete( + messages: Message[], + options: CompletionOptions = {}, + ): Promise { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 300_000); + try { + const response = await fetch(`${this.baseUrl}/v1/chat/completions`, { + method: "POST", + headers: { + Authorization: `Bearer ${this.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: this.model, + messages, + max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS, + temperature: options.temperature ?? 0.2, + response_format: options.jsonMode + ? { type: "json_object" } + : undefined, + }), + signal: controller.signal, + }); + + const body = (await parseJson(response)) as OpenAIChatResponse; + if (!response.ok) { + throw new Error( + body.error?.message ?? + `OpenAI request failed with status ${response.status}`, + ); + } + + const content = body.choices?.[0]?.message?.content; + if (!content) { + throw new Error("OpenAI response did not include completion content"); + } + + return content; + } finally { + clearTimeout(timeout); + } + } +} + +export class AnthropicProvider implements CompactionLLM { + private readonly apiKey: string; + private readonly model: string; + private readonly baseUrl: string; + + constructor(config: Partial = {}) { + this.apiKey = + config.apiKey?.trim() || process.env.ANTHROPIC_API_KEY?.trim() || ""; + this.model = + normalizeModel(config.model) ?? + normalizeModel(process.env.TRAJECTORIES_LLM_MODEL) ?? + DEFAULT_ANTHROPIC_MODEL; + this.baseUrl = + config.baseUrl ?? + process.env.ANTHROPIC_BASE_URL ?? 
+ DEFAULT_ANTHROPIC_BASE_URL; + + if (this.baseUrl !== DEFAULT_ANTHROPIC_BASE_URL) { + console.warn( + `[trajectories] Anthropic base URL overridden to: ${this.baseUrl}`, + ); + } + + if (!this.apiKey) { + throw new Error("ANTHROPIC_API_KEY is required for AnthropicProvider"); + } + } + + async complete( + messages: Message[], + options: CompletionOptions = {}, + ): Promise { + const systemMessages = messages + .filter((message) => message.role === "system") + .map((message) => message.content.trim()) + .filter(Boolean); + + const conversation = messages + .filter((message) => message.role !== "system") + .map((message) => ({ + role: message.role, + content: message.content, + })); + + if (conversation.length === 0) { + throw new Error("AnthropicProvider requires at least one user message"); + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 300_000); + try { + const response = await fetch(`${this.baseUrl}/v1/messages`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "anthropic-version": "2024-10-22", + "x-api-key": this.apiKey, + }, + body: JSON.stringify({ + model: this.model, + system: + systemMessages.length > 0 ? systemMessages.join("\n\n") : undefined, + messages: conversation, + max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS, + temperature: options.temperature ?? 0.2, + }), + signal: controller.signal, + }); + + const body = (await parseJson(response)) as AnthropicResponse; + if (!response.ok) { + throw new Error( + body.error?.message ?? + `Anthropic request failed with status ${response.status}`, + ); + } + + const textBlocks = (body.content ?? 
[]).filter( + ( + block, + ): block is Extract< + AnthropicResponse["content"], + Array + >[number] & { + type: "text"; + text: string; + } => + block.type === "text" && + typeof (block as { text?: unknown }).text === "string", + ); + const content = textBlocks + .map((block) => block.text) + .join("\n") + .trim(); + + if (!content) { + throw new Error("Anthropic response did not include text content"); + } + + return content; + } finally { + clearTimeout(timeout); + } + } +} + +/** + * Kept inline (not imported from @agent-relay/sdk) because that dep was removed in 7e9783c. When agent-relay ships new compaction-capable CLIs, add them here manually. + */ +const SUPPORTED_CLIS = ["claude", "codex", "gemini", "opencode"] as const; +type SupportedCli = (typeof SUPPORTED_CLIS)[number]; + +export class CLIProvider implements CompactionLLM { + private readonly cli: SupportedCli; + private readonly binaryPath: string; + + constructor(cli: SupportedCli, binaryPath: string) { + this.cli = cli; + this.binaryPath = binaryPath; + } + + get cliName(): string { + return this.cli; + } + + async complete( + messages: Message[], + _options: CompletionOptions = {}, + ): Promise { + const prompt = messagesToPrompt(messages); + const args = buildCliArgs(this.cli); + + // Use stdin to avoid OS argument length limits for large prompts + const output = await spawnWithStdin(this.binaryPath, args, prompt); + if (!output) { + throw new Error(`${this.cli} CLI returned empty output`); + } + + return output; + } +} + +function messagesToPrompt(messages: Message[]): string { + const systemParts: string[] = []; + const conversationParts: string[] = []; + + for (const msg of messages) { + if (msg.role === "system") { + systemParts.push(msg.content.trim()); + } else { + conversationParts.push(msg.content.trim()); + } + } + + const parts: string[] = []; + if (systemParts.length > 0) { + parts.push(systemParts.join("\n\n")); + } + if (conversationParts.length > 0) { + 
parts.push(conversationParts.join("\n\n")); + } + + return parts.join("\n\n---\n\n"); +} + +function buildCliArgs(cli: SupportedCli): string[] { + switch (cli) { + case "claude": + return ["-p", "--output-format", "text"]; + case "codex": + return ["exec", "-q"]; + case "gemini": + return ["-p"]; + case "opencode": + return ["run", "--no-color"]; + } +} + +function spawnWithStdin( + command: string, + args: string[], + input: string, +): Promise { + return new Promise((resolve, reject) => { + const child = spawn(command, args, { + timeout: 300_000, + stdio: ["pipe", "pipe", "pipe"], + }); + + const chunks: Buffer[] = []; + child.stdout.on("data", (chunk: Buffer) => chunks.push(chunk)); + + let stderr = ""; + child.stderr.on("data", (chunk: Buffer) => { + stderr += chunk.toString(); + }); + + child.on("error", reject); + child.on("close", (code) => { + if (code !== 0) { + reject( + new Error(`CLI exited with code ${code}: ${stderr.slice(0, 200)}`), + ); + } else { + resolve(Buffer.concat(chunks).toString().trim()); + } + }); + + child.stdin.write(input); + child.stdin.end(); + }); +} + +export async function resolveProvider( + config: Partial = {}, +): Promise { + const explicitProvider = ( + config.provider ?? process.env.TRAJECTORIES_LLM_PROVIDER + )?.toLowerCase(); + const model = normalizeModel(config.model); + + if (explicitProvider === "openai") { + return process.env.OPENAI_API_KEY ? new OpenAIProvider({ model }) : null; + } + + if (explicitProvider === "anthropic") { + return process.env.ANTHROPIC_API_KEY + ? 
new AnthropicProvider({ model }) + : null; + } + + if (explicitProvider === "cli") { + return resolveCLIProvider(); + } + + if (explicitProvider && explicitProvider !== "auto") { + return null; + } + + const cliProvider = await resolveCLIProvider(); + if (cliProvider) { + return cliProvider; + } + + if (process.env.OPENAI_API_KEY) { + return new OpenAIProvider({ model }); + } + + if (process.env.ANTHROPIC_API_KEY) { + return new AnthropicProvider({ model }); + } + + return null; +} + +const CLI_SEARCH_PATHS = [ + "~/.local/bin", + "~/.claude/local", + "/usr/local/bin", + "/opt/homebrew/bin", +]; + +async function resolveCLIProvider(): Promise { + const requestedCli = process.env.TRAJECTORIES_LLM_CLI?.trim().toLowerCase(); + const clisToTry = (() => { + if (!requestedCli) { + return SUPPORTED_CLIS; + } + + if ((SUPPORTED_CLIS as readonly string[]).includes(requestedCli)) { + return [requestedCli as SupportedCli]; + } + + console.warn( + `[trajectories] Unsupported TRAJECTORIES_LLM_CLI value "${requestedCli}", falling back to auto-detect`, + ); + return SUPPORTED_CLIS; + })(); + + for (const cli of clisToTry) { + const path = await findBinary(cli); + if (path) { + return new CLIProvider(cli, path); + } + } + + return null; +} + +async function findBinary(name: string): Promise { + // Try PATH first via `which` + try { + const { stdout } = await execFileAsync("which", [name]); + const path = stdout.trim(); + if (path) return path; + } catch { + // not in PATH + } + + // Fall back to well-known install directories + const home = homedir(); + for (const dir of CLI_SEARCH_PATHS) { + const expanded = dir.startsWith("~/") ? 
join(home, dir.slice(2)) : dir; + const candidate = join(expanded, name); + try { + accessSync(candidate, constants.X_OK); + return candidate; + } catch { + // not found here + } + } + + return undefined; +} + +function normalizeModel(value: string | undefined): string | undefined { + if (typeof value !== "string") { + return undefined; + } + + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +async function parseJson(response: Response): Promise { + const text = await response.text(); + if (!text) { + return {}; + } + + try { + return JSON.parse(text) as unknown; + } catch { + throw new Error( + `Invalid JSON response (status ${response.status}, length ${text.length})`, + ); + } +} diff --git a/src/compact/serializer.ts b/src/compact/serializer.ts new file mode 100644 index 0000000..b5a1d42 --- /dev/null +++ b/src/compact/serializer.ts @@ -0,0 +1,438 @@ +import type { + Chapter, + Decision, + EventSignificance, + Finding, + Retrospective, + Trajectory, + TrajectoryEvent, +} from "../core/types.js"; + +const DEFAULT_MAX_TOKENS = 30000; +const CHARS_PER_TOKEN = 4; +const INCLUDED_SIGNIFICANCE = new Set([ + "medium", + "high", + "critical", +]); + +interface SessionRender { + header: string; + agents: string; + chapters: string[]; + decisions: string; + findings: string; + retrospective: string; + filesAndCommits: string; +} + +export function serializeForLLM( + trajectories: Trajectory[], + maxTokens = DEFAULT_MAX_TOKENS, +): string { + const maxChars = Math.max(0, maxTokens * CHARS_PER_TOKEN); + const sessions = trajectories.map(renderSession); + + let document = joinSessions(sessions); + if (document.length <= maxChars || sessions.length === 0) { + return document; + } + + const fixedChars = sessions.reduce( + (total, session) => + total + + session.header.length + + session.agents.length + + session.decisions.length + + session.findings.length + + session.retrospective.length + + session.filesAndCommits.length, + 0, + ); + 
const chapterChars = sessions.reduce( + (total, session) => + total + + session.chapters.reduce((sum, chapter) => sum + chapter.length, 0), + 0, + ); + + const remainingChapterChars = maxChars - fixedChars; + if (remainingChapterChars <= 0 || chapterChars === 0) { + return truncateText(document, maxChars); + } + + const ratio = Math.min(1, remainingChapterChars / chapterChars); + const truncatedSessions = sessions.map((session) => ({ + ...session, + chapters: truncateChapters( + session.chapters, + session.chapters.reduce((sum, chapter) => sum + chapter.length, 0), + ratio, + ), + })); + + document = joinSessions(truncatedSessions); + return document.length <= maxChars + ? document + : truncateText(document, maxChars); +} + +function renderSession(trajectory: Trajectory): SessionRender { + const sessionTitle = trajectory.task.title.trim() || trajectory.id; + const duration = formatDuration(trajectory.startedAt, trajectory.completedAt); + const header = [ + `## Session: ${sessionTitle} (${trajectory.status}, ${duration})`, + trajectory.task.description + ? `Description: ${trajectory.task.description}` + : "", + `Started: ${trajectory.startedAt}`, + trajectory.completedAt ? `Completed: ${trajectory.completedAt}` : "", + ] + .filter(Boolean) + .join("\n") + .concat("\n"); + + const agents = + trajectory.agents.length > 0 + ? 
`Agents: ${trajectory.agents + .map((agent) => `${agent.name} (${agent.role})`) + .join(", ")}\n` + : "Agents: none recorded\n"; + + const chapters = trajectory.chapters.map(renderChapter); + const decisions = renderDecisions(trajectory); + const findings = renderFindings(trajectory); + const retrospective = renderRetrospective(trajectory.retrospective); + const filesAndCommits = [ + `Files changed: ${formatList(trajectory.filesChanged)}`, + `Commits: ${formatList(trajectory.commits)}`, + ] + .join("\n") + .concat("\n"); + + return { + header, + agents, + chapters, + decisions, + findings, + retrospective, + filesAndCommits, + }; +} + +function renderChapter(chapter: Chapter): string { + const lines = chapter.events + .filter(shouldIncludeEvent) + .map((event) => formatEvent(event)); + + const chapterBody = + lines.length > 0 + ? lines.map((line) => `- ${line}`).join("\n") + : "- No medium/high/critical events captured"; + + return [ + `### Chapter: ${chapter.title}`, + `Agent: ${chapter.agentName}`, + `Window: ${chapter.startedAt} -> ${chapter.endedAt ?? "ongoing"}`, + chapterBody, + ] + .join("\n") + .concat("\n"); +} + +function renderDecisions(trajectory: Trajectory): string { + const seen = new Set(); + const decisions: Decision[] = []; + + for (const chapter of trajectory.chapters) { + for (const event of chapter.events) { + if (event.type !== "decision") { + continue; + } + + const decision = asDecision(event.raw); + if (!decision) { + continue; + } + + const key = `${decision.question}\n${decision.chosen}\n${decision.reasoning}`; + if (!seen.has(key)) { + seen.add(key); + decisions.push(decision); + } + } + } + + for (const decision of trajectory.retrospective?.decisions ?? 
[]) { + const key = `${decision.question}\n${decision.chosen}\n${decision.reasoning}`; + if (!seen.has(key)) { + seen.add(key); + decisions.push(decision); + } + } + + if (decisions.length === 0) { + return "Decisions:\n- None recorded\n"; + } + + return [ + "Decisions:", + ...decisions.map((decision) => + [ + `- Question: ${decision.question}`, + ` Chosen: ${decision.chosen}`, + ` Reasoning: ${decision.reasoning}`, + ].join("\n"), + ), + ] + .join("\n") + .concat("\n"); +} + +function renderFindings(trajectory: Trajectory): string { + const findings = trajectory.chapters.flatMap((chapter) => + chapter.events + .filter((event) => event.type === "finding") + .map((event) => asFinding(event.raw, event.content)), + ); + + if (findings.length === 0) { + return "Findings:\n- None recorded\n"; + } + + return [ + "Findings:", + ...findings.map((finding) => + [ + `- What: ${finding.what}`, + ` Where: ${finding.where}`, + ` Significance: ${finding.significance}`, + ].join("\n"), + ), + ] + .join("\n") + .concat("\n"); +} + +function renderRetrospective(retrospective?: Retrospective): string { + if (!retrospective) { + return "Retrospective:\n- None recorded\n"; + } + + const lines = [ + "Retrospective:", + `- Summary: ${retrospective.summary}`, + ` Approach: ${retrospective.approach}`, + ]; + + if (retrospective.challenges && retrospective.challenges.length > 0) { + lines.push(` Challenges: ${retrospective.challenges.join("; ")}`); + } + + if (retrospective.learnings && retrospective.learnings.length > 0) { + lines.push(` Learnings: ${retrospective.learnings.join("; ")}`); + } + + if (retrospective.suggestions && retrospective.suggestions.length > 0) { + lines.push(` Suggestions: ${retrospective.suggestions.join("; ")}`); + } + + if (retrospective.timeSpent) { + lines.push(` Time spent: ${retrospective.timeSpent}`); + } + + return lines.join("\n").concat("\n"); +} + +function shouldIncludeEvent(event: TrajectoryEvent): boolean { + if (event.type === "tool_call" || 
event.type === "tool_result") { + return false; + } + + return INCLUDED_SIGNIFICANCE.has(resolveSignificance(event)); +} + +function resolveSignificance(event: TrajectoryEvent): EventSignificance { + if (event.significance) { + return event.significance; + } + + switch (event.type) { + case "decision": + case "finding": + case "error": + return "high"; + case "reflection": + case "note": + case "message_sent": + case "message_received": + return "medium"; + default: + return "low"; + } +} + +function formatEvent(event: TrajectoryEvent): string { + if (event.type === "decision") { + const decision = asDecision(event.raw); + if (decision) { + return `[decision/${resolveSignificance(event)}] ${decision.question} -> ${decision.chosen}`; + } + } + + if (event.type === "finding") { + const finding = asFinding(event.raw, event.content); + return `[finding/${resolveSignificance(event)}] ${finding.what} @ ${finding.where}`; + } + + return `[${event.type}/${resolveSignificance(event)}] ${event.content}`; +} + +function asDecision(raw: unknown): Decision | null { + if (!raw || typeof raw !== "object") { + return null; + } + + const candidate = raw as Partial; + if ( + typeof candidate.question !== "string" || + typeof candidate.chosen !== "string" || + typeof candidate.reasoning !== "string" + ) { + return null; + } + + return { + question: candidate.question, + chosen: candidate.chosen, + reasoning: candidate.reasoning, + alternatives: Array.isArray(candidate.alternatives) + ? candidate.alternatives + : [], + confidence: candidate.confidence, + }; +} + +function asFinding(raw: unknown, fallbackContent: string): Finding { + if (!raw || typeof raw !== "object") { + return { + what: fallbackContent, + where: "unknown", + significance: "Not structured", + category: "other", + }; + } + + const candidate = raw as Partial; + return { + what: + typeof candidate.what === "string" && candidate.what.trim().length > 0 + ? 
candidate.what + : fallbackContent, + where: + typeof candidate.where === "string" && candidate.where.trim().length > 0 + ? candidate.where + : "unknown", + significance: + typeof candidate.significance === "string" && + candidate.significance.trim().length > 0 + ? candidate.significance + : "Not structured", + category: candidate.category ?? "other", + suggestedAction: + typeof candidate.suggestedAction === "string" + ? candidate.suggestedAction + : undefined, + confidence: candidate.confidence, + }; +} + +function truncateChapters( + chapters: string[], + totalChapterChars: number, + ratio: number, +): string[] { + if (ratio >= 1 || totalChapterChars === 0) { + return chapters; + } + + let remaining = Math.floor(totalChapterChars * ratio); + + return chapters.map((chapter, index) => { + if (remaining <= 0) { + return "### Chapter: Truncated\n- Omitted due to token budget\n"; + } + + const proportionalTarget = + index === chapters.length - 1 + ? remaining + : Math.floor(chapter.length * ratio); + const allowance = Math.max(0, Math.min(chapter.length, proportionalTarget)); + remaining -= allowance; + + return truncateText(chapter, allowance); + }); +} + +function joinSessions(sessions: SessionRender[]): string { + return sessions + .map((session) => + [ + session.header, + session.agents, + ...session.chapters, + session.decisions, + session.findings, + session.retrospective, + session.filesAndCommits, + ] + .filter(Boolean) + .join("\n") + .trim(), + ) + .join("\n\n"); +} + +function truncateText(text: string, maxChars: number): string { + if (maxChars <= 0) { + return ""; + } + + if (text.length <= maxChars) { + return text; + } + + if (maxChars <= 16) { + return text.slice(0, maxChars); + } + + return `${text.slice(0, maxChars - 16).trimEnd()}\n[truncated]\n`; +} + +function formatList(values: string[]): string { + return values.length > 0 ? 
values.join(", ") : "none"; +} + +function formatDuration(startedAt: string, completedAt?: string): string { + const start = new Date(startedAt).getTime(); + const end = new Date(completedAt ?? startedAt).getTime(); + const elapsedMs = Math.max(0, end - start); + const minutes = Math.floor(elapsedMs / 60000); + const hours = Math.floor(minutes / 60); + const remainingMinutes = minutes % 60; + + if (hours > 0 && remainingMinutes > 0) { + return `${hours}h ${remainingMinutes}m`; + } + + if (hours > 0) { + return `${hours}h`; + } + + if (minutes > 0) { + return `${minutes}m`; + } + + return completedAt ? "0m" : "ongoing"; +} diff --git a/src/core/schema.ts b/src/core/schema.ts index 7fe45e1..eae1c7d 100644 --- a/src/core/schema.ts +++ b/src/core/schema.ts @@ -49,21 +49,20 @@ export const TrajectoryStatusSchema = z.enum([ "abandoned", ]); -/** - * Event type schema - */ -export const TrajectoryEventTypeSchema = z.enum([ - "prompt", - "thinking", - "tool_call", - "tool_result", - "message_sent", - "message_received", - "decision", - "finding", - "reflection", - "note", - "error", +/** Permissive on read so trajectories from other tools can load even with unknown event types. */ +export const TrajectoryEventTypeSchema = z.union([ + z.literal("prompt"), + z.literal("thinking"), + z.literal("tool_call"), + z.literal("tool_result"), + z.literal("message_sent"), + z.literal("message_received"), + z.literal("decision"), + z.literal("finding"), + z.literal("reflection"), + z.literal("note"), + z.literal("error"), + z.string(), // Allow event types emitted by other tools (e.g. agent-relay's completion-evidence / completion-marker). Downstream code filters to known types. 
]); /** @@ -244,6 +243,7 @@ export const TrajectorySchema = z.object({ commits: z.array(z.string()), filesChanged: z.array(z.string()), projectId: z.string(), + workflowId: z.string().optional(), tags: z.array(z.string()), _trace: TrajectoryTraceRefSchema.optional(), }); diff --git a/src/core/types.ts b/src/core/types.ts index 7845da4..3e60150 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -225,6 +225,8 @@ export interface Trajectory { filesChanged: string[]; /** Project identifier */ projectId: string; + /** Opaque id set by the workflow runner via TRAJECTORIES_WORKFLOW_ID env var. Lets trail compact --workflow collate all trajectories from a single workflow run. */ + workflowId?: string; /** User-defined tags */ tags: string[]; /** Trace information for code attribution */ @@ -257,6 +259,8 @@ export interface CreateTrajectoryInput { source?: TaskSource; /** Optional project ID (defaults to cwd) */ projectId?: string; + /** Opaque id set by the workflow runner via TRAJECTORIES_WORKFLOW_ID env var. Lets trail compact --workflow collate all trajectories from a single workflow run. */ + workflowId?: string; /** Optional initial tags */ tags?: string[]; } diff --git a/src/sdk/client.ts b/src/sdk/client.ts index ba50d32..14b7be4 100644 --- a/src/sdk/client.ts +++ b/src/sdk/client.ts @@ -5,6 +5,10 @@ * Provides a clean, developer-friendly API with automatic storage management. 
*/ +import { spawn } from "node:child_process"; +import { existsSync, readFileSync } from "node:fs"; +import { createRequire } from "node:module"; +import { dirname, resolve as resolvePath } from "node:path"; import { TrajectoryError, abandonTrajectory, @@ -31,6 +35,177 @@ import { exportToTimeline } from "../export/timeline.js"; import { FileStorage } from "../storage/file.js"; import type { StorageAdapter } from "../storage/interface.js"; +const require = createRequire(import.meta.url); + +interface TrajectoryCliPackage { + name?: string; + bin?: string | Record; +} + +interface TrajectoryCliInvocation { + command: string; + args: string[]; +} + +function normalizeOptionalString(value?: string): string | undefined { + const normalized = value?.trim(); + return normalized ? normalized : undefined; +} + +function normalizeAutoCompactOptions( + autoCompact?: boolean | { mechanical?: boolean; markdown?: boolean }, +): false | { mechanical: boolean; markdown: boolean } { + if (!autoCompact) { + return false; + } + + if (autoCompact === true) { + return { mechanical: false, markdown: true }; + } + + return { + mechanical: autoCompact.mechanical ?? false, + markdown: autoCompact.markdown ?? 
true, + }; +} + +function resolveStartWorkflowId( + options?: Omit, +): string | undefined { + if ( + options !== undefined && + Object.prototype.hasOwnProperty.call(options, "workflowId") + ) { + return normalizeOptionalString(options.workflowId); + } + + return normalizeOptionalString(process.env.TRAJECTORIES_WORKFLOW_ID); +} + +function resolveTrajectoryCliInvocation(): TrajectoryCliInvocation { + const explicitCli = normalizeOptionalString(process.env.TRAJECTORIES_CLI); + if (explicitCli) { + if (/\.(?:cjs|mjs|js)$/i.test(explicitCli)) { + return { command: process.execPath, args: [explicitCli] }; + } + return { command: explicitCli, args: [] }; + } + + try { + const packageJsonPath = require.resolve("agent-trajectories/package.json"); + const pkg = JSON.parse( + readFileSync(packageJsonPath, "utf-8"), + ) as TrajectoryCliPackage; + const binEntry = + typeof pkg.bin === "string" + ? pkg.bin + : (pkg.bin?.trail ?? (pkg.name ? pkg.bin?.[pkg.name] : undefined)); + + if (binEntry) { + const cliPath = resolvePath(dirname(packageJsonPath), binEntry); + if (existsSync(cliPath)) { + return { command: process.execPath, args: [cliPath] }; + } + } + } catch { + // Fall back to the CLI on PATH when package resolution is unavailable. + } + + return { command: "trail", args: [] }; +} + +function parseCompactWorkflowOutput(stdout: string): { + compactedPath: string; + markdownPath?: string; +} { + const compactedPath = stdout.match( + /^\s*Compacted trajectory saved to:\s*(.+)$/m, + )?.[1]; + const markdownPath = stdout.match( + /^\s*Markdown summary saved to:\s*(.+)$/m, + )?.[1]; + + if (!compactedPath) { + throw new Error("compactWorkflow failed: unable to parse compacted path"); + } + + return { + compactedPath: compactedPath.trim(), + ...(markdownPath ? 
{ markdownPath: markdownPath.trim() } : {}), + }; +} + +export async function compactWorkflow( + workflowId: string, + options?: { markdown?: boolean; mechanical?: boolean; cwd?: string }, +): Promise<{ compactedPath: string; markdownPath?: string }> { + const normalizedWorkflowId = normalizeOptionalString(workflowId); + if (!normalizedWorkflowId) { + throw new Error("compactWorkflow failed: workflowId is required"); + } + + const cli = resolveTrajectoryCliInvocation(); + const args = [ + ...cli.args, + "compact", + "--workflow", + normalizedWorkflowId, + "--all", + ]; + + if (options?.markdown) { + args.push("--markdown"); + } + if (options?.mechanical) { + args.push("--mechanical"); + } + + return new Promise((resolve, reject) => { + const child = spawn(cli.command, args, { + cwd: options?.cwd, + stdio: ["ignore", "pipe", "pipe"], + }); + + const stdoutChunks: Buffer[] = []; + let stderr = ""; + + child.stdout.on("data", (chunk: Buffer) => { + stdoutChunks.push(Buffer.from(chunk)); + }); + + child.stderr.on("data", (chunk: Buffer) => { + stderr += chunk.toString(); + process.stderr.write(chunk); + }); + + child.on("error", (error) => { + reject(new Error(`compactWorkflow failed: ${error.message}`)); + }); + + child.on("close", (code) => { + if (code !== 0) { + reject( + new Error( + `compactWorkflow failed: ${stderr.trim() || `exit code ${code}`}`, + ), + ); + return; + } + + try { + const stdout = Buffer.concat(stdoutChunks).toString("utf-8"); + resolve(parseCompactWorkflowOutput(stdout)); + } catch (error) { + reject( + error instanceof Error + ? error + : new Error("compactWorkflow failed: unable to parse CLI output"), + ); + } + }); + }); +} + /** * Options for configuring the TrajectoryClient */ @@ -45,6 +220,10 @@ export interface TrajectoryClientOptions { projectId?: string; /** Whether to auto-save after each operation. 
Defaults to true */ autoSave?: boolean; + /** + * When set, session.complete() and session.done() automatically run compactWorkflow() against the trajectory's workflowId. Default false. Pass an object to control the flags passed to the CLI — e.g. { mechanical: true } skips the LLM for deterministic compaction, { markdown: false } skips the .md companion. + */ + autoCompact?: boolean | { mechanical?: boolean; markdown?: boolean }; } /** @@ -79,6 +258,25 @@ export class TrajectorySession { return this.trajectory.id; } + private async autoCompactIfConfigured(): Promise { + const autoCompact = this.client.getAutoCompactOptions(); + if (!autoCompact || !this.trajectory.workflowId) { + return; + } + + try { + await compactWorkflow(this.trajectory.workflowId, { + ...autoCompact, + cwd: this.client.getAutoCompactCwd(), + }); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error( + `Warning: autoCompact failed for workflow ${this.trajectory.workflowId}: ${message}`, + ); + } + } + /** * Start a new chapter * @param title - Chapter title @@ -221,6 +419,7 @@ export class TrajectorySession { async complete(input: CompleteTrajectoryInput): Promise { this.trajectory = completeTrajectory(this.trajectory, input); await this.client.save(this.trajectory); + await this.autoCompactIfConfigured(); return this.trajectory; } @@ -321,12 +520,26 @@ export class TrajectoryClient { readonly defaultAgent?: string; private projectId?: string; private autoSave: boolean; + private readonly autoCompactCwd?: string; + private readonly autoCompact: + | false + | { mechanical: boolean; markdown: boolean }; constructor(options: TrajectoryClientOptions = {}) { this.storage = options.storage ?? new FileStorage(options.dataDir); this.defaultAgent = options.defaultAgent ?? process.env.TRAJECTORIES_AGENT; this.projectId = options.projectId ?? process.env.TRAJECTORIES_PROJECT; this.autoSave = options.autoSave ?? 
true; + this.autoCompact = normalizeAutoCompactOptions(options.autoCompact); + this.autoCompactCwd = options.storage ? undefined : options.dataDir; + } + + getAutoCompactOptions(): false | { mechanical: boolean; markdown: boolean } { + return this.autoCompact; + } + + getAutoCompactCwd(): string | undefined { + return this.autoCompactCwd; } /** @@ -375,14 +588,21 @@ export class TrajectoryClient { ); } + const workflowId = resolveStartWorkflowId(options); + const { workflowId: _workflowId, ...createOptions } = options ?? {}; + const trajectory = createTrajectory({ title, projectId: this.projectId, - ...options, + ...createOptions, }); - await this.storage.save(trajectory); - return new TrajectorySession(trajectory, this, this.autoSave); + const stampedTrajectory = workflowId + ? { ...trajectory, workflowId } + : trajectory; + + await this.storage.save(stampedTrajectory); + return new TrajectorySession(stampedTrajectory, this, this.autoSave); } /** diff --git a/src/sdk/index.ts b/src/sdk/index.ts index 7d8d37c..7c2c382 100644 --- a/src/sdk/index.ts +++ b/src/sdk/index.ts @@ -31,7 +31,11 @@ */ // Client (with storage) -export { TrajectoryClient, TrajectorySession } from "./client.js"; +export { + TrajectoryClient, + TrajectorySession, + compactWorkflow, +} from "./client.js"; export type { TrajectoryClientOptions } from "./client.js"; // Builder (in-memory) diff --git a/tests/compact/llm-compact.test.ts b/tests/compact/llm-compact.test.ts new file mode 100644 index 0000000..f19bb11 --- /dev/null +++ b/tests/compact/llm-compact.test.ts @@ -0,0 +1,465 @@ +import { mkdtemp, readFile, readdir, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { runCommand } from "../../src/cli/runner.js"; +import { generateCompactionMarkdown } from "../../src/compact/markdown.js"; +import { + mergeCompactionWithMetadata, + parseCompactionResponse, +} from 
"../../src/compact/parser.js"; +import { buildCompactionPrompt } from "../../src/compact/prompts.js"; +import type { Message as PromptMessage } from "../../src/compact/prompts.js"; +import { + CLIProvider, + type CompactionLLM, + type CompletionOptions, + resolveProvider, +} from "../../src/compact/provider.js"; +import { serializeForLLM } from "../../src/compact/serializer.js"; +import type { Decision, Trajectory } from "../../src/core/types.js"; + +describe("LLM compaction", () => { + let tempDir: string; + let originalCwd: string; + let originalEnv: Record; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), "trail-llm-compact-")); + originalCwd = process.cwd(); + process.chdir(tempDir); + + originalEnv = { + OPENAI_API_KEY: process.env.OPENAI_API_KEY, + ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY, + TRAJECTORIES_LLM_PROVIDER: process.env.TRAJECTORIES_LLM_PROVIDER, + TRAJECTORIES_LLM_MODEL: process.env.TRAJECTORIES_LLM_MODEL, + TRAJECTORIES_LLM_MAX_INPUT_TOKENS: + process.env.TRAJECTORIES_LLM_MAX_INPUT_TOKENS, + TRAJECTORIES_LLM_MAX_OUTPUT_TOKENS: + process.env.TRAJECTORIES_LLM_MAX_OUTPUT_TOKENS, + TRAJECTORIES_LLM_TEMPERATURE: process.env.TRAJECTORIES_LLM_TEMPERATURE, + }; + + clearCompactionEnv(); + }); + + afterEach(async () => { + process.chdir(originalCwd); + restoreEnv(originalEnv); + await rm(tempDir, { recursive: true, force: true }); + }); + + it("serializes trajectories for LLM compaction", () => { + const serialized = serializeForLLM([createTrajectory()], 4000); + + expect(serialized).toContain("## Session: Update compact command"); + expect(serialized).toContain("Agents: Worker (lead)"); + expect(serialized).toContain("Question: Should compact use LLM summaries?"); + expect(serialized).toContain( + "Files changed: src/cli/commands/compact.ts, src/compact/provider.ts", + ); + }); + + it("parses structured LLM output", () => { + const parsed = parseCompactionResponse(`\`\`\`json +{ + "narrative": "LLM compaction now synthesizes 
the completed sessions into a concise technical summary.", + "decisions": [ + { + "question": "How should compact choose its strategy?", + "chosen": "Prefer LLM compaction when a provider is available.", + "reasoning": "It produces a denser summary while mechanical data still preserves files and commits.", + "impact": "CLI output becomes more useful after merges." + } + ], + "conventions": [ + { + "pattern": "Keep mechanical metadata even when LLM output is used.", + "rationale": "Files, commits, and agents are deterministic and should not rely on model output.", + "scope": "compact command" + } + ], + "lessons": [ + { + "lesson": "Dry runs should show the full prompt and token estimate.", + "context": "LLM calls can be expensive and hard to debug without visibility.", + "recommendation": "Print the constructed messages before invoking the provider." + } + ], + "openQuestions": [ + "Should the command persist raw model responses for debugging?" + ] +} +\`\`\``); + + expect(parsed.narrative).toContain("LLM compaction now synthesizes"); + expect(parsed.decisions).toHaveLength(1); + expect(parsed.decisions[0]?.impact).toContain("CLI output becomes"); + expect(parsed.conventions[0]?.pattern).toContain( + "Keep mechanical metadata", + ); + expect(parsed.lessons[0]?.recommendation).toContain("constructed messages"); + expect(parsed.openQuestions).toEqual([ + "Should the command persist raw model responses for debugging?", + ]); + }); + + it("generates markdown from LLM compaction output", () => { + const markdown = generateCompactionMarkdown({ + id: "compact_123", + version: 1, + type: "compacted", + compactedAt: "2026-03-28T12:00:00.000Z", + sourceTrajectories: ["traj_1", "traj_2"], + dateRange: { + start: "2026-03-20T10:00:00.000Z", + end: "2026-03-28T12:00:00.000Z", + }, + summary: { + totalDecisions: 3, + totalEvents: 14, + uniqueAgents: ["Worker", "Reviewer"], + }, + filesAffected: ["src/cli/commands/compact.ts", "src/compact/config.ts"], + commits: ["abc1234", 
"def5678"], + narrative: + "The command now prefers LLM compaction when a provider is available.", + decisions: [ + { + question: "How should compact choose the summary strategy?", + chosen: + "Auto-detect an LLM provider unless mechanical mode is forced.", + reasoning: + "This preserves the old flow but upgrades the default path.", + impact: "The command can produce higher-signal summaries by default.", + }, + ], + conventions: [ + { + pattern: "Always keep files and commits from the mechanical pass.", + rationale: "That data is deterministic and cheap to compute.", + scope: "LLM compaction output", + }, + ], + lessons: [ + { + lesson: "Token estimates are required for dry runs.", + context: "LLM compaction can be expensive.", + recommendation: "Print the estimate before calling the provider.", + }, + ], + openQuestions: ["Should config support per-project prompt templates?"], + }); + + expect(markdown).toContain("# Trajectory Compaction:"); + expect(markdown).toContain("## Key Decisions (1)"); + expect(markdown).toContain("| Question | Decision | Impact |"); + expect(markdown).toContain("## Conventions Established"); + expect(markdown).toContain("## Lessons Learned"); + }); + + it("runs the full LLM compaction pipeline with a mocked provider", () => { + const stubbedResponse = JSON.stringify({ + narrative: "Sessions focused on adding LLM-backed compaction.", + decisions: [ + { + question: "How to integrate LLM output?", + chosen: "Merge with mechanical metadata.", + reasoning: "Keeps deterministic data intact.", + impact: "Reliable file and commit lists.", + }, + ], + conventions: [ + { + pattern: "Always retain mechanical metadata.", + rationale: "It is deterministic.", + scope: "compact command", + }, + ], + lessons: [ + { + lesson: "Token budgeting prevents context overflow.", + context: "Large trajectories exceed model limits.", + recommendation: "Truncate chapters proportionally.", + }, + ], + openQuestions: ["Should raw model responses be persisted?"], + 
}); + + const mockProvider: CompactionLLM = { + complete: async ( + _messages: PromptMessage[], + _options?: CompletionOptions, + ): Promise => stubbedResponse, + }; + + const trajectory = createTrajectory(); + const serialized = serializeForLLM([trajectory], 4000); + const messages = buildCompactionPrompt(serialized); + + // Verify the prompt was built with user + system messages + expect(messages.length).toBeGreaterThanOrEqual(2); + expect(messages[0]?.role).toBe("system"); + + // Run the mocked provider + return mockProvider.complete(messages, { jsonMode: true }).then((raw) => { + const parsed = parseCompactionResponse(raw); + + expect(parsed.narrative).toContain("LLM-backed compaction"); + expect(parsed.decisions).toHaveLength(1); + expect(parsed.conventions).toHaveLength(1); + expect(parsed.lessons).toHaveLength(1); + expect(parsed.openQuestions).toHaveLength(1); + + // Merge with metadata + const merged = mergeCompactionWithMetadata( + { + id: "compact_mock", + version: 1, + type: "compacted", + compactedAt: new Date().toISOString(), + sourceTrajectories: [trajectory.id], + dateRange: { + start: trajectory.startedAt, + end: trajectory.completedAt ?? trajectory.startedAt, + }, + summary: { + totalDecisions: 1, + totalEvents: 2, + uniqueAgents: ["Worker"], + }, + filesAffected: trajectory.filesChanged ?? [], + commits: trajectory.commits ?? 
[], + }, + parsed, + ); + + expect(merged.id).toBe("compact_mock"); + expect(merged.narrative).toContain("LLM-backed compaction"); + expect(merged.filesAffected).toContain("src/cli/commands/compact.ts"); + + // Verify markdown generation works end-to-end + const md = generateCompactionMarkdown(merged); + expect(md).toContain("## Summary"); + expect(md).toContain("LLM-backed compaction"); + }); + }); + + it( + "uses mechanical compaction with --mechanical flag", + { timeout: 15_000 }, + async () => { + const started = await runCommand(["start", "Update compact command"]); + expect(started.success).toBe(true); + + const decided = await runCommand([ + "decision", + "Use LLM compaction when available", + "--reasoning", + "It produces denser summaries while keeping mechanical metadata.", + ]); + expect(decided.success).toBe(true); + + const completed = await runCommand([ + "complete", + "--summary", + "Finished LLM compaction flow", + "--confidence", + "0.91", + ]); + expect(completed.success).toBe(true); + + const result = await runCommand(["compact", "--mechanical"]); + + expect(result.success).toBe(true); + expect(result.output).toContain("Compacted trajectory saved to:"); + + const compactedDir = join(tempDir, ".trajectories", "compacted"); + const compactedFiles = await readdir(compactedDir); + const jsonFile = compactedFiles.find((file) => file.endsWith(".json")); + const markdownFile = compactedFiles.find((file) => file.endsWith(".md")); + + expect(jsonFile).toBeDefined(); + expect(markdownFile).toBeDefined(); + + const compacted = JSON.parse( + await readFile(join(compactedDir, jsonFile ?? 
""), "utf-8"), + ) as { + filesAffected?: string[]; + decisionGroups?: unknown[]; + narrative?: string; + }; + + expect(compacted.narrative).toBeUndefined(); + expect(compacted.decisionGroups).toBeDefined(); + expect(compacted.filesAffected).toBeDefined(); + }, + ); +}); + +describe("CLI provider resolution", () => { + let originalEnv: Record; + + beforeEach(() => { + originalEnv = { + OPENAI_API_KEY: process.env.OPENAI_API_KEY, + ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY, + TRAJECTORIES_LLM_PROVIDER: process.env.TRAJECTORIES_LLM_PROVIDER, + }; + clearCompactionEnv(); + }); + + afterEach(() => { + restoreEnv(originalEnv); + vi.restoreAllMocks(); + }); + + it("prefers CLI over API providers in auto mode even when API keys are present", async () => { + // Auto mode prefers local CLIs (claude / codex / gemini / opencode) so + // users never need to set an API key by default. API providers are only + // used on explicit opt-in via TRAJECTORIES_LLM_PROVIDER=openai|anthropic. + process.env.OPENAI_API_KEY = "sk-test"; + const provider = await resolveProvider({}); + expect(provider).not.toBeNull(); + // When a supported CLI is installed, auto mode selects it. When no CLI + // is found, auto falls back to the API provider — that path is covered + // by a separate test below. 
+ if (provider instanceof CLIProvider) { + expect(provider).toBeInstanceOf(CLIProvider); + } + }); + + it("respects explicit TRAJECTORIES_LLM_PROVIDER=openai even when a CLI is installed", async () => { + process.env.OPENAI_API_KEY = "sk-test"; + process.env.TRAJECTORIES_LLM_PROVIDER = "openai"; + const provider = await resolveProvider({}); + expect(provider).not.toBeNull(); + expect(provider).not.toBeInstanceOf(CLIProvider); + }); + + it("falls back to CLI provider when no API keys are set", async () => { + const provider = await resolveProvider({}); + // Will be CLIProvider if claude/codex is installed, null otherwise + if (provider !== null) { + expect(provider).toBeInstanceOf(CLIProvider); + } + }); + + it("returns CLI provider when explicit provider is 'cli'", async () => { + const provider = await resolveProvider({ provider: "cli" }); + // Will be CLIProvider if a supported CLI is installed, null otherwise + if (provider !== null) { + expect(provider).toBeInstanceOf(CLIProvider); + } + }); + + it("CLIProvider exposes the cli name", () => { + const provider = new CLIProvider("claude", "/usr/local/bin/claude"); + expect(provider.cliName).toBe("claude"); + }); +}); + +function createTrajectory(id = "traj_compact_llm"): Trajectory { + const startedAt = "2026-03-20T10:00:00.000Z"; + const completedAt = "2026-03-20T11:15:00.000Z"; + const decision: Decision = { + question: "Should compact use LLM summaries?", + chosen: "Use LLM output when a provider is configured.", + reasoning: + "It captures denser technical patterns while keeping deterministic metadata from the mechanical pass.", + alternatives: [{ option: "Use only mechanical summaries" }], + confidence: 0.86, + }; + + return { + id, + version: 1, + task: { + title: "Update compact command", + description: + "Add LLM-backed compaction with prompt preview and markdown output.", + }, + status: "completed", + startedAt, + completedAt, + agents: [ + { + name: "Worker", + role: "lead", + joinedAt: startedAt, + }, + 
], + chapters: [ + { + id: "chapter_1", + title: "Implementation", + agentName: "Worker", + startedAt, + endedAt: completedAt, + events: [ + { + ts: new Date(startedAt).getTime(), + type: "decision", + content: "Switch compact to an LLM-first flow", + raw: decision, + significance: "high", + }, + { + ts: new Date("2026-03-20T10:30:00.000Z").getTime(), + type: "finding", + content: + "Existing mechanical output still provides accurate files and commits.", + raw: { + what: "Mechanical compaction already computes deterministic metadata.", + where: "src/cli/commands/compact.ts", + significance: "Useful for merge step", + category: "pattern", + }, + significance: "high", + }, + ], + }, + ], + retrospective: { + summary: "The command now supports LLM and mechanical compaction paths.", + approach: + "Build the prompt from serialized trajectories, then merge parsed output with deterministic metadata.", + decisions: [decision], + learnings: ["Keep artifact writing shared across both paths."], + suggestions: [ + "Add a mocked provider test later if CLI coverage expands.", + ], + confidence: 0.88, + timeSpent: "1h 15m", + }, + commits: ["abc1234"], + filesChanged: ["src/cli/commands/compact.ts", "src/compact/provider.ts"], + projectId: "test-project", + tags: ["compact", "llm"], + }; +} + +function clearCompactionEnv(): void { + for (const key of [ + "OPENAI_API_KEY", + "ANTHROPIC_API_KEY", + "TRAJECTORIES_LLM_PROVIDER", + "TRAJECTORIES_LLM_MODEL", + "TRAJECTORIES_LLM_MAX_INPUT_TOKENS", + "TRAJECTORIES_LLM_MAX_OUTPUT_TOKENS", + "TRAJECTORIES_LLM_TEMPERATURE", + ]) { + delete process.env[key]; + } +} + +function restoreEnv(values: Record): void { + for (const [key, value] of Object.entries(values)) { + if (value === undefined) { + delete process.env[key]; + } else { + process.env[key] = value; + } + } +} diff --git a/tests/sdk/workflow-compact.test.ts b/tests/sdk/workflow-compact.test.ts new file mode 100644 index 0000000..c9d8e78 --- /dev/null +++ 
b/tests/sdk/workflow-compact.test.ts @@ -0,0 +1,561 @@ +import { spawnSync } from "node:child_process"; +import { existsSync, mkdirSync, readdirSync, writeFileSync } from "node:fs"; +import { mkdtemp, readFile, realpath, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { dirname, join, resolve as resolvePath } from "node:path"; +import { fileURLToPath } from "node:url"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); +const repoRoot = resolvePath(__dirname, "../.."); +const cliSourceEntry = join(repoRoot, "src/cli/index.ts"); +const cliDistEntry = join(repoRoot, "dist/cli/index.js"); + +interface EnvSnapshot { + TRAJECTORIES_WORKFLOW_ID?: string; + TRAJECTORIES_DATA_DIR?: string; + TRAJECTORIES_SEARCH_PATHS?: string; + TRAJECTORIES_CLI?: string; + TRAJECTORIES_PROJECT?: string; +} + +function snapshotEnv(): EnvSnapshot { + return { + TRAJECTORIES_WORKFLOW_ID: process.env.TRAJECTORIES_WORKFLOW_ID, + TRAJECTORIES_DATA_DIR: process.env.TRAJECTORIES_DATA_DIR, + TRAJECTORIES_SEARCH_PATHS: process.env.TRAJECTORIES_SEARCH_PATHS, + TRAJECTORIES_CLI: process.env.TRAJECTORIES_CLI, + TRAJECTORIES_PROJECT: process.env.TRAJECTORIES_PROJECT, + }; +} + +function clearEnv(key: string): void { + Reflect.deleteProperty(process.env, key); +} + +function restoreEnv(snapshot: EnvSnapshot): void { + for (const key of Object.keys(snapshot) as (keyof EnvSnapshot)[]) { + const value = snapshot[key]; + if (value === undefined) { + clearEnv(key); + } else { + process.env[key] = value; + } + } +} + +interface CompactCliInvocationRecord { + argv: string[]; + cwd: string; +} + +function writeRecordingCompactCli( + wrapperPath: string, + realCliPath: string, + recordPath: string, +): void { + writeFileSync( + wrapperPath, + `import { spawnSync } from "node:child_process"; +import { mkdirSync, writeFileSync } from "node:fs"; +import { dirname } from 
"node:path"; + +const recordPath = ${JSON.stringify(recordPath)}; +const realCliPath = ${JSON.stringify(realCliPath)}; + +mkdirSync(dirname(recordPath), { recursive: true }); +writeFileSync( + recordPath, + JSON.stringify( + { + argv: process.argv.slice(2), + cwd: process.cwd(), + }, + null, + 2, + ), + "utf-8", +); + +const result = spawnSync(process.execPath, [realCliPath, ...process.argv.slice(2)], { + cwd: process.cwd(), + env: process.env, + stdio: "inherit", +}); + +if (result.error) { + throw result.error; +} + +process.exit(result.status ?? 1); +`, + "utf-8", + ); +} + +describe("workflow compaction", () => { + let tempDir: string; + let originalCwd: string; + let envSnapshot: EnvSnapshot; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), "trail-wf-compact-")); + originalCwd = process.cwd(); + envSnapshot = snapshotEnv(); + // Ensure tests start with a clean trajectory env so storage paths + // resolve relative to the tmp cwd. + clearEnv("TRAJECTORIES_WORKFLOW_ID"); + clearEnv("TRAJECTORIES_DATA_DIR"); + clearEnv("TRAJECTORIES_SEARCH_PATHS"); + clearEnv("TRAJECTORIES_CLI"); + clearEnv("TRAJECTORIES_PROJECT"); + process.chdir(tempDir); + }); + + afterEach(async () => { + process.chdir(originalCwd); + restoreEnv(envSnapshot); + await rm(tempDir, { recursive: true, force: true }); + }); + + it("stamps workflowId from TRAJECTORIES_WORKFLOW_ID onto new trajectories", async () => { + const { TrajectoryClient } = await import("../../src/sdk/client.js"); + process.env.TRAJECTORIES_WORKFLOW_ID = "wf-env-stamp"; + + const client = new TrajectoryClient(); + await client.init(); + const session = await client.start("Env stamped task"); + + expect(session.data.workflowId).toBe("wf-env-stamp"); + + await client.close(); + }); + + it("trail start CLI stamps workflowId from TRAJECTORIES_WORKFLOW_ID env var", async () => { + const envForCli = { + ...process.env, + TRAJECTORIES_WORKFLOW_ID: "wf-cli-env", + }; + const startResult = spawnSync( + "npx", + 
["tsx", cliSourceEntry, "start", "CLI env stamped task", "--quiet"], + { cwd: tempDir, encoding: "utf-8", env: envForCli }, + ); + expect(startResult.status).toBe(0); + const trajectoryId = startResult.stdout.trim(); + expect(trajectoryId).toMatch(/^traj_/); + + const activePath = join( + tempDir, + ".trajectories", + "active", + `${trajectoryId}.json`, + ); + expect(existsSync(activePath)).toBe(true); + const raw = JSON.parse(await readFile(activePath, "utf-8")) as { + workflowId?: string; + }; + expect(raw.workflowId).toBe("wf-cli-env"); + }); + + it("trail start CLI honors --workflow flag even when env var is unset", async () => { + const startResult = spawnSync( + "npx", + [ + "tsx", + cliSourceEntry, + "start", + "CLI flag stamped task", + "--workflow", + "wf-cli-flag", + "--quiet", + ], + { cwd: tempDir, encoding: "utf-8", env: process.env }, + ); + expect(startResult.status).toBe(0); + const trajectoryId = startResult.stdout.trim(); + const activePath = join( + tempDir, + ".trajectories", + "active", + `${trajectoryId}.json`, + ); + const raw = JSON.parse(await readFile(activePath, "utf-8")) as { + workflowId?: string; + }; + expect(raw.workflowId).toBe("wf-cli-flag"); + }); + + it("leaves workflowId undefined when TRAJECTORIES_WORKFLOW_ID is unset", async () => { + const { TrajectoryClient } = await import("../../src/sdk/client.js"); + expect(process.env.TRAJECTORIES_WORKFLOW_ID).toBeUndefined(); + + const client = new TrajectoryClient(); + await client.init(); + const session = await client.start("Unstamped task"); + + expect(session.data.workflowId).toBeUndefined(); + + await client.close(); + }); + + it("compacts only trajectories matching --workflow via the CLI", async () => { + const { TrajectoryClient } = await import("../../src/sdk/client.js"); + const client = new TrajectoryClient(); + await client.init(); + + process.env.TRAJECTORIES_WORKFLOW_ID = "wf-a"; + const taggedSession = await client.start("Tagged work"); + const taggedId = 
taggedSession.id; + await taggedSession.done("tagged done", 0.9); + + clearEnv("TRAJECTORIES_WORKFLOW_ID"); + const untaggedSession = await client.start("Untagged work"); + await untaggedSession.done("untagged done", 0.9); + + await client.close(); + + const spawnResult = spawnSync( + "npx", + [ + "tsx", + cliSourceEntry, + "compact", + "--workflow", + "wf-a", + "--mechanical", + "--all", + ], + { + cwd: tempDir, + encoding: "utf-8", + env: process.env, + }, + ); + + expect(spawnResult.status).toBe(0); + + const compactedPath = join( + tempDir, + ".trajectories/compacted/workflow-wf-a.json", + ); + expect(existsSync(compactedPath)).toBe(true); + + const compacted = JSON.parse(await readFile(compactedPath, "utf-8")) as { + sourceTrajectories: string[]; + workflowId?: string; + }; + + expect(compacted.sourceTrajectories).toEqual([taggedId]); + expect(compacted.workflowId).toBe("wf-a"); + }, 60_000); + + it("compactWorkflow SDK helper runs the CLI and returns the output paths", async () => { + // The SDK helper spawns the CLI via resolveTrajectoryCliInvocation(), + // which looks up `agent-trajectories/package.json`. That lookup is not + // guaranteed to succeed inside the repo under test, so point the helper + // at the built CLI explicitly. + if (!existsSync(cliDistEntry)) { + throw new Error( + `dist CLI missing at ${cliDistEntry}. 
Run \`npm run build\` before executing this test.`, + ); + } + process.env.TRAJECTORIES_CLI = cliDistEntry; + + const { TrajectoryClient, compactWorkflow } = await import( + "../../src/sdk/client.js" + ); + + const client = new TrajectoryClient(); + await client.init(); + + process.env.TRAJECTORIES_WORKFLOW_ID = "wf-a"; + const session = await client.start("SDK helper task"); + await session.decide("Which approach?", "Option A", "Cleaner abstraction"); + await session.done("sdk helper done", 0.85); + clearEnv("TRAJECTORIES_WORKFLOW_ID"); + + await client.close(); + + const result = await compactWorkflow("wf-a", { + mechanical: true, + markdown: true, + cwd: tempDir, + }); + + expect(result.compactedPath).toBeTruthy(); + expect(existsSync(result.compactedPath)).toBe(true); + + if (result.markdownPath) { + expect(existsSync(result.markdownPath)).toBe(true); + } + + const compacted = JSON.parse( + await readFile(result.compactedPath, "utf-8"), + ) as { workflowId?: string; sourceTrajectories: string[] }; + expect(compacted.workflowId).toBe("wf-a"); + expect(compacted.sourceTrajectories).toHaveLength(1); + }, 60_000); + + it("does not drop trajectories that contain unknown event types", async () => { + // Write a raw trajectory JSON with an event whose `type` is not one of + // the canonical TrajectoryEventType values. The schema must be lenient + // enough to still load the file so `trail compact` emits a compacted + // summary that includes it. 
+ const dataDir = join(tempDir, ".trajectories"); + const monthDir = join(dataDir, "completed", "2026-04"); + mkdirSync(monthDir, { recursive: true }); + + const trajId = "traj_schemalenient01"; + const timestamp = "2026-04-12T10:00:00.000Z"; + const rawTrajectory = { + id: trajId, + version: 1, + task: { title: "Schema leniency" }, + status: "completed", + startedAt: timestamp, + completedAt: timestamp, + agents: [ + { + name: "tester", + role: "lead", + joinedAt: timestamp, + }, + ], + workflowId: "wf-lenient", + chapters: [ + { + id: "ch_lenient_1", + title: "Work", + agentName: "tester", + startedAt: timestamp, + events: [ + { + ts: Date.parse(timestamp), + type: "decision", + content: "Chose Option A", + raw: { + question: "Which option?", + chosen: "Option A", + alternatives: [], + reasoning: "Simpler", + }, + }, + { + ts: Date.parse(timestamp) + 1, + type: "completion-evidence", + content: "Evidence collected from external tool", + }, + ], + }, + ], + retrospective: { + summary: "All done", + approach: "Just did the thing", + confidence: 0.9, + }, + commits: [], + filesChanged: [], + projectId: tempDir, + tags: [], + }; + + const trajFilePath = join(monthDir, `${trajId}.json`); + writeFileSync(trajFilePath, JSON.stringify(rawTrajectory, null, 2)); + + // Index entry is how FileStorage.list() finds completed trajectories. 
+ const indexPath = join(dataDir, "index.json"); + const index = { + version: 1, + lastUpdated: timestamp, + trajectories: { + [trajId]: { + title: "Schema leniency", + status: "completed", + startedAt: timestamp, + completedAt: timestamp, + path: trajFilePath, + }, + }, + }; + writeFileSync(indexPath, JSON.stringify(index, null, 2)); + + const spawnResult = spawnSync( + "npx", + [ + "tsx", + cliSourceEntry, + "compact", + "--workflow", + "wf-lenient", + "--mechanical", + "--all", + ], + { + cwd: tempDir, + encoding: "utf-8", + env: process.env, + }, + ); + + expect(spawnResult.status).toBe(0); + + const compactedPath = join( + tempDir, + ".trajectories/compacted/workflow-wf-lenient.json", + ); + expect(existsSync(compactedPath)).toBe(true); + + const compacted = JSON.parse(await readFile(compactedPath, "utf-8")) as { + sourceTrajectories: string[]; + }; + + // The presence of the unknown event type must NOT cause the trajectory + // to be silently dropped at load time. + expect(compacted.sourceTrajectories).toContain(trajId); + }, 60_000); + + describe("autoCompact option", () => { + it("autoCompact: true + workflowId set => complete() compacts from the configured dataDir with the expected CLI flags", async () => { + if (!existsSync(cliDistEntry)) { + throw new Error( + `dist CLI missing at ${cliDistEntry}. 
Run \`npm run build\` before executing this test.`, + ); + } + const workspaceDir = join(tempDir, "sdk-workspace"); + const recordingCliPath = join(tempDir, "recording-compact-cli.mjs"); + const recordPath = join(tempDir, "recorded-compact-invocation.json"); + writeRecordingCompactCli(recordingCliPath, cliDistEntry, recordPath); + + process.env.TRAJECTORIES_CLI = recordingCliPath; + process.env.TRAJECTORIES_WORKFLOW_ID = "wf-auto-on"; + + const { TrajectoryClient } = await import("../../src/sdk/client.js"); + const client = new TrajectoryClient({ + dataDir: workspaceDir, + autoCompact: { mechanical: true, markdown: true }, + }); + await client.init(); + + const session = await client.start("Auto-compact task"); + const sessionId = session.id; + const result = await session.complete({ + summary: "auto-compact done", + approach: "Exercise complete() directly", + confidence: 0.9, + }); + + await client.close(); + + expect(result.status).toBe("completed"); + expect(result.id).toBe(sessionId); + + const recordedInvocation = JSON.parse( + await readFile(recordPath, "utf-8"), + ) as CompactCliInvocationRecord; + expect(await realpath(recordedInvocation.cwd)).toBe( + await realpath(workspaceDir), + ); + expect(recordedInvocation.argv).toEqual([ + "compact", + "--workflow", + "wf-auto-on", + "--all", + "--markdown", + "--mechanical", + ]); + + const compactedPath = join( + workspaceDir, + ".trajectories/compacted/workflow-wf-auto-on.json", + ); + expect(existsSync(compactedPath)).toBe(true); + + const compacted = JSON.parse(await readFile(compactedPath, "utf-8")) as { + sourceTrajectories: string[]; + workflowId?: string; + }; + expect(compacted.sourceTrajectories).toContain(sessionId); + expect(compacted.workflowId).toBe("wf-auto-on"); + }, 60_000); + + it("autoCompact: true + no workflowId => complete() succeeds without compacting", async () => { + expect(process.env.TRAJECTORIES_WORKFLOW_ID).toBeUndefined(); + + const { TrajectoryClient } = await 
import("../../src/sdk/client.js"); + const client = new TrajectoryClient({ + autoCompact: { mechanical: true }, + }); + await client.init(); + + const session = await client.start("No-workflow auto-compact task"); + const result = await session.done("completed without workflow", 0.85); + + await client.close(); + + expect(result.status).toBe("completed"); + + const compactedDir = join(tempDir, ".trajectories", "compacted"); + if (existsSync(compactedDir)) { + const files = readdirSync(compactedDir); + const workflowFiles = files.filter((f) => f.startsWith("workflow-")); + expect(workflowFiles).toHaveLength(0); + } + }, 60_000); + + it("autoCompact: false (default) + workflowId set => complete() does NOT compact", async () => { + process.env.TRAJECTORIES_WORKFLOW_ID = "wf-default-off"; + + const { TrajectoryClient } = await import("../../src/sdk/client.js"); + const client = new TrajectoryClient(); + await client.init(); + + const session = await client.start("Default no-compact task"); + await session.done("done without compaction", 0.9); + + await client.close(); + + const compactedPath = join( + tempDir, + ".trajectories/compacted/workflow-wf-default-off.json", + ); + expect(existsSync(compactedPath)).toBe(false); + }, 60_000); + + it("autoCompact degrades gracefully if compaction fails", async () => { + process.env.TRAJECTORIES_CLI = "/nonexistent/trail"; + process.env.TRAJECTORIES_WORKFLOW_ID = "wf-fail"; + + const { TrajectoryClient } = await import("../../src/sdk/client.js"); + const client = new TrajectoryClient({ + autoCompact: { mechanical: true }, + }); + await client.init(); + + const session = await client.start("Graceful degradation task"); + const sessionId = session.id; + const result = await session.done("should still complete", 0.8); + + await client.close(); + + expect(result.status).toBe("completed"); + expect(result.id).toBe(sessionId); + + const completedDir = join(tempDir, ".trajectories", "completed"); + 
expect(existsSync(completedDir)).toBe(true); + const monthDirs = readdirSync(completedDir); + expect(monthDirs.length).toBeGreaterThan(0); + + let found = false; + for (const monthDir of monthDirs) { + const files = readdirSync(join(completedDir, monthDir)); + if (files.some((f) => f.includes(sessionId))) { + found = true; + break; + } + } + expect(found).toBe(true); + }, 60_000); + }); +}); diff --git a/workflows/compact-on-workflow-run.ts b/workflows/compact-on-workflow-run.ts new file mode 100644 index 0000000..1a922bf --- /dev/null +++ b/workflows/compact-on-workflow-run.ts @@ -0,0 +1,198 @@ +/** + * compact-on-workflow-run.ts + * + * Template workflow that demonstrates the auto-compaction pattern: + * + * 1. Assign one TRAJECTORIES_WORKFLOW_ID at the top of the workflow file, + * before `.run()` is called. Every child process (agents + deterministic + * steps) inherits it through process.env, so any `trail start` call or + * SDK `TrajectoryClient.start(...)` during the run gets tagged with this + * workflow id automatically. + * + * 2. Agents do their real work. Anywhere they record a trajectory — via the + * `trail` CLI or the SDK — the trajectory is stamped. No explicit ID + * plumbing per step. + * + * 3. A deterministic step at the very end runs + * `trail compact --workflow --markdown`. That filters to just this + * run's trajectories (via the `workflowId` field) and produces one tight + * .json + .md in `.trajectories/compacted/workflow-.{json,md}`. + * + * 4. No API key is ever required — compaction auto-selects a local CLI + * provider (claude, codex, gemini, or opencode) in auto mode. If no LLM + * provider is available, add `--mechanical` to the compact command to + * fall back to keyword-based compaction. + * + * To use this pattern in your own workflow: copy the top-level `WORKFLOW_ID` + * assignment and the `verify-trajectory` + `compact` + `print-artifact` steps. + * Swap the `summarize` agent step for whatever your workflow actually does. 
+ * + * Run: agent-relay run workflows/compact-on-workflow-run.ts + */ + +import { randomUUID } from "node:crypto"; +import { workflow } from "@agent-relay/sdk/workflows"; + +// Assigned once, at module load time. This runs before any steps execute, +// so subsequent agent + deterministic processes inherit the env var. +const WORKFLOW_ID = `compact-demo-${randomUUID().slice(0, 8)}`; +process.env.TRAJECTORIES_WORKFLOW_ID = WORKFLOW_ID; + +const TRAJ_ROOT = process.cwd(); + +async function runWorkflow() { + const result = await workflow("compact-on-workflow-run") + .description( + `Template: agents record trajectories tagged with workflow id ${WORKFLOW_ID}; the final step compacts them via trail compact --workflow.`, + ) + .pattern("dag") + .channel("wf-compact-demo") + .maxConcurrency(4) + .timeout(1_200_000) + + .agent("summarizer", { + cli: "codex", + preset: "worker", + role: "Records a trajectory summarizing a small piece of the project", + retries: 1, + }) + + // ── Phase 1: Surface a tiny piece of real project context ──────── + .step("read-readme", { + type: "deterministic", + command: `sed -n '1,60p' ${TRAJ_ROOT}/README.md 2>/dev/null || echo "(no README)"`, + captureOutput: true, + failOnError: false, + }) + + // ── Phase 2: Agent does work, recording a trajectory via trail CLI + // The agent inherits TRAJECTORIES_WORKFLOW_ID=${WORKFLOW_ID} from env, + // so `trail start` tags the new trajectory automatically. + .step("summarize", { + agent: "summarizer", + dependsOn: ["read-readme"], + task: `Summarize the README excerpt below by recording a small trajectory with the \`trail\` CLI. Do NOT use any other tools — only the five trail commands listed. + +README excerpt: +{{steps.read-readme.output}} + +Exact sequence: + +1. Abandon any leftover active trajectory from a previous run (safe no-op if none): + trail abandon 2>/dev/null || true + +2. Start the new trajectory. 
Run exactly: + trail start "Summarize README" --quiet + This prints the new trajectory id on stdout. The trajectory is auto-tagged + with TRAJECTORIES_WORKFLOW_ID (inherited from this workflow's env). You do + not need to pass --workflow — the env var handles it. + +3. Record two decisions about the README. Run exactly twice: + trail decision "" --reasoning "" + trail decision "" --reasoning "" + +4. Record a 3-5 sentence synthesis: + trail reflect "<3-5 sentences summarizing the README — purpose, audience, key capabilities>" + +5. Complete the trajectory: + trail complete --summary "Summarized README from workflow run ${WORKFLOW_ID}" --confidence 0.9 + +Exit cleanly once step 5 returns successfully. Do not run any other commands.`, + verification: { type: "exit_code", value: "0" }, + }) + + // ── Phase 3: Verify the trajectory was tagged with our workflow id + .step("verify-trajectory", { + type: "deterministic", + dependsOn: ["summarize"], + command: `cd ${TRAJ_ROOT} && node -e ' +const { readdirSync, readFileSync, existsSync } = require("node:fs"); +const path = require("node:path"); +const completed = ".trajectories/completed"; +if (!existsSync(completed)) { + console.error("NO_COMPLETED_DIR"); + process.exit(1); +} +const walk = (dir) => { + const out = []; + for (const e of readdirSync(dir, { withFileTypes: true })) { + const p = path.join(dir, e.name); + if (e.isDirectory()) out.push(...walk(p)); + else if (e.name.endsWith(".json")) out.push(p); + } + return out; +}; +const tagged = walk(completed).filter((f) => { + try { + const t = JSON.parse(readFileSync(f, "utf-8")); + return t.workflowId === "${WORKFLOW_ID}"; + } catch { + return false; + } +}); +if (tagged.length === 0) { + console.error("NO_TRAJECTORY_TAGGED_WITH_${WORKFLOW_ID}"); + console.error("The summarizer agent did not create a workflow-tagged trajectory."); + process.exit(1); +} +console.log("tagged=" + tagged.length); +for (const f of tagged) console.log(" " + f); +'`, + captureOutput: true, 
+ failOnError: true, + }) + + // ── Phase 4: Compact this workflow's trajectories into one artifact + // Default provider resolution selects the local CLI automatically — no + // API key, no flags. Add --mechanical at the end if you want to skip + // the LLM entirely. + .step("compact", { + type: "deterministic", + dependsOn: ["verify-trajectory"], + command: `cd ${TRAJ_ROOT} && npx tsx src/cli/index.ts compact --workflow ${WORKFLOW_ID} --markdown --all 2>&1 | tail -40`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 5: Print the compacted artifact path + a few sanity checks + .step("print-artifact", { + type: "deterministic", + dependsOn: ["compact"], + command: `cd ${TRAJ_ROOT} && node -e ' +const { readdirSync, readFileSync, statSync, existsSync } = require("node:fs"); +const path = require("node:path"); +const dir = ".trajectories/compacted"; +if (!existsSync(dir)) { console.error("NO_COMPACTED_DIR"); process.exit(1); } +const files = readdirSync(dir).filter((f) => f.startsWith("workflow-${WORKFLOW_ID}")); +if (files.length === 0) { console.error("NO_COMPACTED_ARTIFACT"); process.exit(1); } +for (const f of files) { + const p = path.join(dir, f); + const size = statSync(p).size; + console.log(p + " (" + size + " bytes)"); +} +const jsonFile = files.find((f) => f.endsWith(".json")); +if (jsonFile) { + const data = JSON.parse(readFileSync(path.join(dir, jsonFile), "utf-8")); + console.log("narrative length: " + (data.narrative?.length ?? 0)); + console.log("decisions: " + (Array.isArray(data.decisions) ? data.decisions.length : "n/a")); + console.log("sourceTrajectories: " + JSON.stringify(data.sourceTrajectories ?? 
[])); +} +'`, + captureOutput: true, + failOnError: false, + }) + + .onError("fail-fast") + .run({ cwd: process.cwd() }); + + console.log("Workflow status:", result.status); + console.log("Workflow id:", WORKFLOW_ID); + console.log( + `Compacted artifact: .trajectories/compacted/workflow-${WORKFLOW_ID}.{json,md}`, + ); +} + +runWorkflow().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/workflows/llm-compaction.ts b/workflows/llm-compaction.ts new file mode 100644 index 0000000..db600e3 --- /dev/null +++ b/workflows/llm-compaction.ts @@ -0,0 +1,376 @@ +/** + * llm-compaction.ts + * + * Workflow: Replace the mechanical compaction in compact.ts with + * LLM-powered intelligent summarization. + * + * Current state: compactTrajectories() does keyword-based decision grouping + * and string deduplication. No understanding of what actually happened. + * + * Target state: An LLM reads raw trajectory data (chapters, events, decisions, + * findings, retrospectives) and produces: + * 1. A narrative summary of what was accomplished + * 2. Key decisions with real reasoning (not keyword-matched categories) + * 3. Extracted conventions/patterns that should inform future work + * 4. Lessons learned from failures/challenges + * 5. A compact .md file that's actually useful to read + * + * The LLM compaction should work with any provider (OpenAI, Anthropic, local) + * via a simple chat completion interface. 
+ * + * Run: agent-relay run workflows/llm-compaction.ts + */ + +import { workflow } from "@agent-relay/sdk/workflows"; + +const TRAJ_ROOT = process.cwd(); + +async function main() { + const result = await workflow("llm-compaction") + .description( + "Replace mechanical trajectory compaction with LLM-powered intelligent summarization", + ) + .pattern("dag") + .channel("wf-llm-compaction") + .maxConcurrency(4) + .timeout(3_600_000) + + .agent("architect", { + cli: "claude", + role: "Designs the LLM compaction system", + }) + .agent("llm-builder", { + cli: "codex", + preset: "worker", + role: "Builds the LLM compaction engine", + }) + .agent("prompt-builder", { + cli: "codex", + preset: "worker", + role: "Builds prompts and output parsing", + }) + .agent("cli-builder", { + cli: "codex", + preset: "worker", + role: "Updates the CLI compact command", + }) + .agent("reviewer", { cli: "claude", role: "Reviews the implementation" }) + + .step("design-llm-compaction", { + agent: "architect", + task: `Design the LLM-powered trajectory compaction system. + +Read these files: +- ${TRAJ_ROOT}/src/cli/commands/compact.ts (current mechanical compaction — ~400 lines) +- ${TRAJ_ROOT}/src/core/types.ts (Trajectory, Chapter, TrajectoryEvent, Decision, Finding, Retrospective types) +- ${TRAJ_ROOT}/src/core/trajectory.ts (trajectory lifecycle) + +Current problems with compactTrajectories(): +1. Groups decisions by keyword matching ("architecture", "api", "database") — misses nuance +2. Just dedupes learnings as strings — doesn't synthesize +3. Produces a JSON blob — not a readable document +4. No understanding of what was attempted vs what worked +5. No extraction of reusable patterns/conventions + +Design the replacement: + +1. 
**LLM Provider Interface** (${TRAJ_ROOT}/src/compact/provider.ts): + - CompactionLLM interface: { complete(messages, options): string } + - OpenAIProvider, AnthropicProvider, LocalProvider implementations + - Config from env: TRAJECTORIES_LLM_PROVIDER, TRAJECTORIES_LLM_MODEL, API key + - Fallback: if no LLM configured, use current mechanical compaction + +2. **Trajectory Serializer** (${TRAJ_ROOT}/src/compact/serializer.ts): + - serializeForLLM(trajectories): string — converts raw trajectories to a + structured text format the LLM can read efficiently + - Strips noise (raw tool call data, low-significance events) + - Keeps: decisions, findings, errors, high-significance events, retrospectives + - Budgets tokens: truncate chapters beyond a max (configurable) + - Includes file-level context: "Files changed: src/auth.ts, src/db/schema.ts" + +3. **Compaction Prompts** (${TRAJ_ROOT}/src/compact/prompts.ts): + - COMPACTION_SYSTEM_PROMPT: role definition for the summarizer + - COMPACTION_USER_PROMPT: template with serialized trajectories + - Output format: structured JSON with narrative sections + - Prompt engineering for consistency: "You are reviewing N agent work sessions..." + +4. **Output Parser** (${TRAJ_ROOT}/src/compact/parser.ts): + - Parse LLM JSON response into CompactedTrajectory + - Validate required fields + - Fallback: if LLM returns invalid JSON, extract what we can + +5. **Compacted Output Format** — enhanced from current: + - narrative: string — 2-3 paragraph summary of what happened + - decisions: Array<{ question, chosen, reasoning, impact }> — LLM-analyzed + - conventions: Array<{ pattern, rationale, scope }> — extracted conventions + - lessons: Array<{ lesson, context, recommendation }> — synthesized learnings + - openQuestions: string[] — things left unresolved + - filesAffected: string[] — keep as-is + - commits: string[] — keep as-is + +6. 
**Markdown Output** (${TRAJ_ROOT}/src/compact/markdown.ts): + - Generate a readable .md file alongside the JSON + - Sections: Summary, Key Decisions, Conventions Established, Lessons Learned, Open Questions + - This is what humans actually read + +Output: interfaces, file structure, prompt outline, token budget strategy. +Keep output under 100 lines. End with DESIGN_COMPACTION_COMPLETE.`, + verification: { + type: "output_contains", + value: "DESIGN_COMPACTION_COMPLETE", + }, + timeout: 300_000, + }) + + .step("create-llm-engine", { + agent: "llm-builder", + dependsOn: ["design-llm-compaction"], + task: `Build the LLM compaction engine. + +Design: {{steps.design-llm-compaction.output}} + +Create in ${TRAJ_ROOT}/src/compact/: + +1. provider.ts — LLM provider interface + implementations: + - CompactionLLM interface: complete(messages: Message[], options?: CompletionOptions): Promise + - Message: { role: 'system' | 'user' | 'assistant', content: string } + - CompletionOptions: { maxTokens?: number, temperature?: number, jsonMode?: boolean } + - OpenAIProvider: uses fetch to POST /v1/chat/completions (no SDK dep) + Env: OPENAI_API_KEY, TRAJECTORIES_LLM_MODEL (default: gpt-4o) + - AnthropicProvider: uses fetch to POST /v1/messages + Env: ANTHROPIC_API_KEY, TRAJECTORIES_LLM_MODEL (default: claude-sonnet-4-20250514) + - resolveProvider(): auto-detect from env vars, fallback to null + - No new npm dependencies — raw fetch only + +2. 
serializer.ts — Trajectory → LLM-readable text: + - serializeForLLM(trajectories: Trajectory[], maxTokens?: number): string + - For each trajectory: + - Header: "## Session: {title} ({status}, {duration})" + - Agents: who participated and their roles + - Chapters: title + high/medium/critical events only (skip low) + - Decisions: full question + chosen + reasoning + - Findings: what + where + significance + - Retrospective: summary + approach + challenges + learnings + - Files changed + commits + - Token budgeting: estimate ~4 chars per token + If total > maxTokens (default 30000), truncate chapters proportionally + - Skip: raw tool call data, tool results, low-significance events + +3. index.ts — Re-export everything + +End with LLM_ENGINE_COMPLETE.`, + verification: { type: "output_contains", value: "LLM_ENGINE_COMPLETE" }, + timeout: 900_000, + }) + + .step("create-prompts-parser", { + agent: "prompt-builder", + dependsOn: ["design-llm-compaction"], + task: `Build the compaction prompts and output parser. + +Design: {{steps.design-llm-compaction.output}} + +Create in ${TRAJ_ROOT}/src/compact/: + +1. prompts.ts — Compaction prompt templates: + + COMPACTION_SYSTEM_PROMPT: + "You are a technical analyst reviewing agent work sessions (trajectories). + Your job is to produce a concise, insightful summary that captures: + - What was accomplished and how + - Key decisions and their reasoning + - Patterns/conventions established that should be followed in future work + - Lessons learned from challenges and failures + - Open questions or unresolved issues + + Be specific. Reference actual file paths, function names, and technical details. + Don't be generic — this summary replaces the raw data." 
+ + buildCompactionPrompt(serializedTrajectories: string, options?: PromptOptions): Message[] + - Constructs system + user messages + - User message includes the serialized trajectories + - Requests structured JSON output matching CompactedOutput schema + - Includes output schema in the prompt for format guidance + + PromptOptions: { focusAreas?: string[], maxOutputTokens?: number } + +2. parser.ts — Parse LLM response: + - parseCompactionResponse(llmOutput: string): LLMCompactedOutput + - LLMCompactedOutput: { + narrative: string, + decisions: Array<{ question, chosen, reasoning, impact }>, + conventions: Array<{ pattern, rationale, scope }>, + lessons: Array<{ lesson, context, recommendation }>, + openQuestions: string[], + } + - Try JSON.parse first + - If fails: try extracting JSON from markdown code blocks + - If fails: try extracting sections from prose (regex for ## headers) + - Validate: narrative required, decisions/conventions/lessons arrays + - Merge with mechanical data (files, commits, agents) for full CompactedTrajectory + +3. markdown.ts — Generate readable .md: + - generateCompactionMarkdown(compacted: CompactedTrajectory & LLMCompactedOutput): string + - Format: + # Trajectory Compaction: {dateRange} + + ## Summary + {narrative} + + ## Key Decisions ({count}) + | Question | Decision | Impact | + |----------|----------|--------| + + ## Conventions Established + - **{pattern}**: {rationale} (scope: {scope}) + + ## Lessons Learned + - {lesson} — {recommendation} + + ## Open Questions + - {question} + + ## Stats + - Sessions: {count}, Agents: {names}, Files: {count}, Commits: {count} + - Date range: {start} - {end} + +End with PROMPTS_PARSER_COMPLETE.`, + verification: { + type: "output_contains", + value: "PROMPTS_PARSER_COMPLETE", + }, + timeout: 900_000, + }) + + .step("update-cli", { + agent: "cli-builder", + dependsOn: ["create-llm-engine", "create-prompts-parser"], + task: `Update the CLI compact command to use LLM compaction. 
+ +Modify ${TRAJ_ROOT}/src/cli/commands/compact.ts: + +1. Add --llm flag (default: true if LLM provider detected, false otherwise) +2. Add --mechanical flag to force old behavior +3. Add --focus flag: comma-separated focus areas for the LLM +4. Add --markdown flag (default: true): also output .md file + +Updated flow: +a) Load trajectories (existing loadTrajectories — keep as-is) +b) If --mechanical or no LLM provider: use existing compactTrajectories() +c) If LLM available: + 1. serializeForLLM(trajectories) → text + 2. buildCompactionPrompt(text, options) → messages + 3. provider.complete(messages) → llmOutput + 4. parseCompactionResponse(llmOutput) → llmCompacted + 5. Merge with mechanical data (files, commits, agents) + 6. Save JSON to .trajectories/compacted/ + 7. Save .md alongside if --markdown + 8. Print summary + +d) Keep dry-run working with LLM (show prompt + estimated tokens) +e) Show cost estimate: "Estimated: ~{tokens} input tokens, ~{output} output tokens" + +Also create: +- ${TRAJ_ROOT}/src/compact/config.ts — Configuration: + - getCompactionConfig(): reads from env or .trajectories/config.json + - Config: { provider, model, maxInputTokens, maxOutputTokens, temperature } + - Defaults: provider=auto, maxInput=30000, maxOutput=4000, temperature=0.3 + +Add tests: +- ${TRAJ_ROOT}/tests/compact/llm-compact.test.ts + - Test serializer with sample trajectories + - Test parser with sample LLM output + - Test markdown generation + - Test fallback to mechanical when no LLM + +End with CLI_UPDATE_COMPLETE.`, + verification: { type: "output_contains", value: "CLI_UPDATE_COMPLETE" }, + timeout: 900_000, + }) + + .step("review-compaction", { + agent: "reviewer", + dependsOn: ["update-cli"], + task: `Review the LLM compaction system. 
+ +Files: +- ${TRAJ_ROOT}/src/compact/provider.ts +- ${TRAJ_ROOT}/src/compact/serializer.ts +- ${TRAJ_ROOT}/src/compact/prompts.ts +- ${TRAJ_ROOT}/src/compact/parser.ts +- ${TRAJ_ROOT}/src/compact/markdown.ts +- ${TRAJ_ROOT}/src/compact/config.ts +- ${TRAJ_ROOT}/src/compact/index.ts +- ${TRAJ_ROOT}/src/cli/commands/compact.ts (modified) +- ${TRAJ_ROOT}/tests/compact/llm-compact.test.ts + +Verify: +1. No new npm dependencies (raw fetch only for LLM calls) +2. Graceful fallback: no API key → mechanical compaction +3. Token budgeting prevents exceeding model context window +4. Parser handles malformed LLM output without crashing +5. Prompt is specific enough to get useful output, not generic summaries +6. Markdown output is clean and readable +7. Dry-run shows prompt + cost estimate without calling LLM +8. Config can be set via env vars OR .trajectories/config.json +9. Existing mechanical compaction still works with --mechanical flag +10. Tests cover serializer, parser, markdown, and fallback + +Fix issues. Keep output under 50 lines. End with COMPACTION_REVIEW_COMPLETE.`, + verification: { + type: "output_contains", + value: "COMPACTION_REVIEW_COMPLETE", + }, + timeout: 300_000, + }) + + .step("commit", { + agent: "llm-builder", + dependsOn: ["review-compaction"], + task: `In ${TRAJ_ROOT}: +1. git checkout -b feat/llm-compaction +2. git add src/compact/ src/cli/commands/compact.ts tests/compact/ +3. git commit -m "feat: LLM-powered trajectory compaction + +Replaces mechanical keyword-based compaction with intelligent LLM summarization. 
+ +New compact/ module: + - provider.ts: OpenAI + Anthropic providers (raw fetch, no deps) + - serializer.ts: trajectory → LLM-readable text with token budgeting + - prompts.ts: system + user prompts for compaction + - parser.ts: parse LLM JSON output with fallbacks + - markdown.ts: generate readable .md summaries + - config.ts: env vars or .trajectories/config.json + +CLI updates: + - trail compact now uses LLM by default (if API key present) + - --mechanical flag for old behavior + - --focus for targeted summaries + - --markdown flag (default: true) for .md output + - Dry-run shows prompt + cost estimate + +Output includes: + - Narrative summary (what happened, how) + - Key decisions with reasoning and impact + - Extracted conventions/patterns for future work + - Synthesized lessons from challenges + - Open questions / unresolved issues + +Backwards compatible: falls back to mechanical if no LLM provider." +4. git push origin feat/llm-compaction + +Report commit hash. End with COMMIT_COMPLETE.`, + verification: { type: "output_contains", value: "COMMIT_COMPLETE" }, + timeout: 120_000, + }) + + .onError("retry", { maxRetries: 1, retryDelayMs: 10_000 }) + .run({ cwd: process.cwd() }); + + console.log("LLM compaction complete:", result.status); +} + +main().catch((error) => { + console.error(error); + process.exitCode = 1; +}); diff --git a/workflows/sdk-autocompact-option.ts b/workflows/sdk-autocompact-option.ts new file mode 100644 index 0000000..4014d07 --- /dev/null +++ b/workflows/sdk-autocompact-option.ts @@ -0,0 +1,522 @@ +/** + * sdk-autocompact-option.ts + * + * Adds an `autoCompact` option to TrajectoryClient so that + * session.complete() / session.done() automatically runs compactWorkflow() + * when the trajectory has a workflowId. This removes the need for a + * separate compact step in any relay workflow that uses the SDK — users + * just call complete() and the compacted artifact appears at + * .trajectories/compacted/workflow-.{json,md}. 
+ * + * Validation strategy (80 -> 100): + * 1. BEFORE: deterministic tsx probe creates a trajectory with + * TRAJECTORIES_WORKFLOW_ID set, calls complete(), and asserts NO + * compacted file was produced. Proves autoCompact doesn't silently + * exist today. + * 2. Implement: codex edits src/sdk/client.ts to add the option and + * wire it through TrajectorySession.complete() + done(). + * 3. Tests: claude adds vitest cases covering the 4 autoCompact + * permutations (true/false x workflowId present/absent), plus a + * graceful-failure case. + * 4. AFTER: deterministic tsx probe with autoCompact: true asserts + * the compacted file DOES exist and has narrative + decisions. + * 5. Hard gate: the BEFORE file must not exist, the AFTER file must + * exist and be non-trivial. + * 6. Self-review (codex) + peer-review (claude) with diff as input. + * 7. Full regression + typecheck + commit + push to PR. + * + * Team split (per relay-80-100 + writing-agent-relay-workflows skills): + * - impl (codex, worker preset): edits client.ts + * - tester (claude, worker preset): writes vitest cases, runs + * before/after probes, iterates on failures + * - reviewer (claude, worker preset): peer-reviews the diff + * - self-reviewer (codex, worker preset): self-reviews its own diff + * + * All agents use `preset: 'worker'` to avoid the interactive-claude PTY + * hang we hit in the last implementation workflow. 
+ * + * Run: agent-relay run workflows/sdk-autocompact-option.ts + */ + +import { workflow } from "@agent-relay/sdk/workflows"; + +const TRAJ_ROOT = process.cwd(); + +async function runWorkflow() { + const result = await workflow("sdk-autocompact-option") + .description( + "Add autoCompact option to TrajectoryClient so session.complete() auto-runs compactWorkflow() when a workflowId is present", + ) + .pattern("dag") + .channel("wf-autocompact-option") + .maxConcurrency(5) + .timeout(2_400_000) + + .agent("impl", { + cli: "codex", + preset: "worker", + role: "Implements the autoCompact option in TrajectoryClient", + retries: 2, + }) + .agent("tester", { + cli: "claude", + preset: "worker", + role: "Writes tests and runs the before/after probes", + retries: 2, + }) + .agent("reviewer", { + cli: "claude", + preset: "worker", + role: "Peer-reviews the implementation diff", + retries: 1, + }) + .agent("self-reviewer", { + cli: "codex", + preset: "worker", + role: "Self-reviews the implementation diff", + retries: 1, + }) + + // ── Phase 0: Clean workspace + build dist ─────────────────────── + .step("clean-workspace", { + type: "deterministic", + command: `rm -rf ${TRAJ_ROOT}/.trajectories-test/autocompact && mkdir -p ${TRAJ_ROOT}/.trajectories-test/autocompact/before ${TRAJ_ROOT}/.trajectories-test/autocompact/after && echo OK`, + captureOutput: true, + failOnError: true, + }) + // Build dist/cli/index.js so the SDK's compactWorkflow() can resolve + // the trail binary. Without this the probe spawns "trail" on PATH, + // which is not installed in a dev checkout — compaction then fails + // silently (graceful degradation kicks in), and autoCompact looks + // like a no-op when it's actually the harness that's broken. 
+ .step("build-dist", { + type: "deterministic", + dependsOn: ["clean-workspace"], + command: `cd ${TRAJ_ROOT} && npm run build 2>&1 | tail -10 && test -f dist/cli/index.js && echo "BUILD_OK"`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 1: Read context files ───────────────────────────────── + .step("read-client", { + type: "deterministic", + dependsOn: ["build-dist"], + command: `cat ${TRAJ_ROOT}/src/sdk/client.ts`, + captureOutput: true, + }) + .step("read-tests", { + type: "deterministic", + dependsOn: ["build-dist"], + command: `cat ${TRAJ_ROOT}/tests/sdk/workflow-compact.test.ts`, + captureOutput: true, + }) + + // ── Phase 2: BEFORE probe — prove autoCompact doesn't exist yet ─ + // The probe creates a trajectory with a workflowId, calls complete(), + // and asserts no compacted file was produced. On the clean tree this + // MUST succeed (there is no autoCompact today, so nothing happens on + // complete). This locks in the "before" baseline. + .step("before-probe", { + type: "deterministic", + dependsOn: ["read-client"], + command: `cd ${TRAJ_ROOT}/.trajectories-test/autocompact/before && TRAJECTORIES_WORKFLOW_ID=wf-before TRAJECTORIES_CLI=${TRAJ_ROOT}/dist/cli/index.js npx tsx ${TRAJ_ROOT}/scripts/autocompact-probe.mts 2>&1 && node -e ' +const { existsSync, readdirSync } = require("node:fs"); +const compactedDir = ".trajectories/compacted"; +const files = existsSync(compactedDir) ? 
readdirSync(compactedDir) : []; +const matches = files.filter((f) => f.includes("workflow-wf-before")); +if (matches.length > 0) { + console.error("BEFORE_FAILED: expected no compacted file, found " + matches.join(", ")); + process.exit(1); +} +console.log("BEFORE_OK: no compacted artifact produced (as expected pre-feature / opt-in off)"); +'`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 3: Implementation ───────────────────────────────────── + .step("edit-client", { + agent: "impl", + dependsOn: ["before-probe", "read-client"], + task: `Edit ${TRAJ_ROOT}/src/sdk/client.ts to add an \`autoCompact\` option on TrajectoryClient so that session.complete() / session.done() automatically invoke compactWorkflow() when the trajectory has a workflowId. + +Current contents: +{{steps.read-client.output}} + +Required changes: + +1. Extend \`TrajectoryClientOptions\` with an OPTIONAL field: + autoCompact?: boolean | { mechanical?: boolean; markdown?: boolean }; + Document it with a JSDoc comment: "When set, session.complete() and session.done() automatically run compactWorkflow() against the trajectory's workflowId. Default false. Pass an object to control the flags passed to the CLI — e.g. { mechanical: true } skips the LLM for deterministic compaction, { markdown: false } skips the .md companion." + +2. Store the option on TrajectoryClient: + - Add a \`private readonly autoCompact: false | { mechanical: boolean; markdown: boolean }\` field. + - In the constructor, normalize: \`false\` when unset/false, otherwise \`{ mechanical: ..., markdown: ... }\` with defaults \`mechanical: false, markdown: true\`. + - Expose a getter or method on TrajectoryClient that TrajectorySession can read (e.g. \`getAutoCompactOptions(): false | { mechanical: boolean; markdown: boolean }\`). + +3. 
Update TrajectorySession so that after complete() saves the trajectory, if the stored autoCompact is not false AND this.trajectory.workflowId is set, call: + await compactWorkflow(this.trajectory.workflowId, autoCompactOptions); + Wrap the call in try/catch — on failure, log a warning to console.error but do NOT throw. complete() must still return the completed trajectory cleanly. Failure here is a degradation, not a hard error (the raw trajectory is already saved). + +4. Do the same in session.done() by routing through complete() (or by calling compactWorkflow directly if done() has its own path). Make sure both entry points trigger autoCompact. + +5. Do NOT change the signature of complete() / done(). Do NOT move compactWorkflow into the core path. Keep the SDK's role as "tag + shell out" intact. + +6. autoCompact is purely opt-in. When \`new TrajectoryClient()\` is called with no options, behavior must be unchanged — no compaction fires on complete(). + +Only edit src/sdk/client.ts. No other files.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("verify-edit-client", { + type: "deterministic", + dependsOn: ["edit-client"], + command: `cd ${TRAJ_ROOT} && if git diff --quiet src/sdk/client.ts; then echo NOT_MODIFIED; exit 1; fi && grep -q "autoCompact" src/sdk/client.ts && echo OK`, + failOnError: true, + captureOutput: true, + }) + + // ── Phase 4: Typecheck ────────────────────────────────────────── + .step("typecheck", { + type: "deterministic", + dependsOn: ["verify-edit-client"], + command: `cd ${TRAJ_ROOT} && npm run typecheck 2>&1 | tail -40; echo "EXIT: $?"`, + captureOutput: true, + failOnError: false, + }) + .step("fix-typecheck", { + agent: "impl", + dependsOn: ["typecheck"], + task: `If the typecheck output below shows errors (non-zero EXIT), fix them. If EXIT: 0 and no errors, do nothing. + +Output: +{{steps.typecheck.output}} + +Only edit src/sdk/client.ts. 
Re-run \`npm run typecheck\` until it passes.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("typecheck-final", { + type: "deterministic", + dependsOn: ["fix-typecheck"], + command: `cd ${TRAJ_ROOT} && npm run typecheck 2>&1 | tail -20`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 5: Tests ────────────────────────────────────────────── + .step("add-tests", { + agent: "tester", + dependsOn: ["typecheck-final", "read-tests"], + task: `Extend ${TRAJ_ROOT}/tests/sdk/workflow-compact.test.ts with new vitest cases covering the autoCompact option. Do NOT touch existing tests. + +Current file: +{{steps.read-tests.output}} + +Add these four cases inside the existing \`describe("workflow compaction", ...)\` block (or a nested describe — your call): + +1. "autoCompact: true + workflowId set => complete() produces a compacted file" + - process.env.TRAJECTORIES_WORKFLOW_ID = "wf-auto-on" + - const client = new TrajectoryClient({ autoCompact: { mechanical: true, markdown: true } }) + - await client.init(); start a session; call session.done("...", 0.9) + - Assert .trajectories/compacted/workflow-wf-auto-on.json exists and its sourceTrajectories contains the session id + +2. "autoCompact: true + no workflowId => complete() succeeds without compacting" + - TRAJECTORIES_WORKFLOW_ID unset + - Same client config + - Assert complete() returns normally AND .trajectories/compacted has no workflow-* files + +3. "autoCompact: false (default) + workflowId set => complete() does NOT compact (backwards compat)" + - process.env.TRAJECTORIES_WORKFLOW_ID = "wf-default-off" + - const client = new TrajectoryClient() (no autoCompact option) + - Assert no compacted file exists after complete() + +4. 
"autoCompact degrades gracefully if compaction fails" + - Point TRAJECTORIES_CLI at a path that doesn't exist: process.env.TRAJECTORIES_CLI = "/nonexistent/trail" + - process.env.TRAJECTORIES_WORKFLOW_ID = "wf-fail" + - autoCompact: true + - Assert session.done() still resolves to a completed Trajectory (does NOT throw) + - Assert the raw trajectory is saved to disk + +Use the existing helpers (clearEnv, snapshotEnv, restoreEnv, tempDir cwd). Use \`{ mechanical: true }\` in autoCompact so tests don't hit a real LLM. + +Only edit tests/sdk/workflow-compact.test.ts.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("verify-add-tests", { + type: "deterministic", + dependsOn: ["add-tests"], + command: `cd ${TRAJ_ROOT} && if git diff --quiet tests/sdk/workflow-compact.test.ts; then echo NOT_MODIFIED; exit 1; fi && grep -qc "autoCompact" tests/sdk/workflow-compact.test.ts && echo OK`, + failOnError: true, + captureOutput: true, + }) + + .step("run-tests", { + type: "deterministic", + dependsOn: ["verify-add-tests"], + command: `cd ${TRAJ_ROOT} && npx vitest run tests/sdk/workflow-compact.test.ts 2>&1 | tail -80; echo "EXIT: $?"`, + captureOutput: true, + failOnError: false, + }) + .step("fix-tests", { + agent: "tester", + dependsOn: ["run-tests"], + task: `If the test output below shows failures (non-zero EXIT or FAIL lines), fix them. If all passed, do nothing. + +Test output: +{{steps.run-tests.output}} + +You may edit tests/sdk/workflow-compact.test.ts OR src/sdk/client.ts to fix real bugs. 
Re-run \`npx vitest run tests/sdk/workflow-compact.test.ts\` until green.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("run-tests-final", { + type: "deterministic", + dependsOn: ["fix-tests"], + command: `cd ${TRAJ_ROOT} && npx vitest run tests/sdk/workflow-compact.test.ts 2>&1 | tail -60`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 6: AFTER probe — prove autoCompact now works ────────── + .step("after-probe", { + type: "deterministic", + dependsOn: ["run-tests-final"], + command: `cd ${TRAJ_ROOT}/.trajectories-test/autocompact/after && TRAJECTORIES_WORKFLOW_ID=wf-after PROBE_AUTOCOMPACT=true TRAJECTORIES_CLI=${TRAJ_ROOT}/dist/cli/index.js npx tsx ${TRAJ_ROOT}/scripts/autocompact-probe.mts 2>&1 && node -e ' +const { existsSync, readdirSync, readFileSync, statSync } = require("node:fs"); +const { join } = require("node:path"); +const compactedDir = ".trajectories/compacted"; +if (!existsSync(compactedDir)) { + console.error("AFTER_FAILED: .trajectories/compacted not created"); + process.exit(1); +} +const files = readdirSync(compactedDir).filter((f) => f.includes("workflow-wf-after")); +if (files.length < 1) { + console.error("AFTER_FAILED: no workflow-wf-after artifact"); + process.exit(1); +} +const jsonFile = files.find((f) => f.endsWith(".json")); +const mdFile = files.find((f) => f.endsWith(".md")); +if (!jsonFile) { console.error("AFTER_FAILED: no .json artifact"); process.exit(1); } +if (!mdFile) { console.error("AFTER_FAILED: no .md artifact"); process.exit(1); } +const data = JSON.parse(readFileSync(join(compactedDir, jsonFile), "utf-8")); +const jsonBytes = statSync(join(compactedDir, jsonFile)).size; +const mdBytes = statSync(join(compactedDir, mdFile)).size; +console.log("AFTER_OK: " + jsonFile + " (" + jsonBytes + "B), " + mdFile + " (" + mdBytes + "B)"); +console.log("sourceTrajectories: " + JSON.stringify(data.sourceTrajectories)); +console.log("decisions: " + (Array.isArray(data.decisions) ? 
data.decisions.length : "n/a")); +if (!Array.isArray(data.sourceTrajectories) || data.sourceTrajectories.length < 1) { + console.error("AFTER_FAILED: sourceTrajectories empty"); + process.exit(1); +} +'`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 7: Hard BEFORE/AFTER gate ───────────────────────────── + .step("before-after-gate", { + type: "deterministic", + dependsOn: ["after-probe", "before-probe"], + command: `cd ${TRAJ_ROOT} && node -e ' +const { existsSync, readdirSync } = require("node:fs"); +const beforeDir = ".trajectories-test/autocompact/before/.trajectories/compacted"; +const afterDir = ".trajectories-test/autocompact/after/.trajectories/compacted"; +const failures = []; +if (existsSync(beforeDir)) { + const beforeFiles = readdirSync(beforeDir).filter((f) => f.includes("workflow-wf-before")); + if (beforeFiles.length > 0) failures.push("BEFORE produced compacted files: " + beforeFiles.join(", ")); +} +if (!existsSync(afterDir)) failures.push("AFTER did not create compacted dir"); +else { + const afterFiles = readdirSync(afterDir).filter((f) => f.includes("workflow-wf-after")); + if (afterFiles.length < 2) failures.push("AFTER expected .json + .md, found " + afterFiles.length + ": " + afterFiles.join(", ")); +} +if (failures.length) { + console.error("BEFORE/AFTER GATE FAILED:"); + for (const f of failures) console.error(" - " + f); + process.exit(1); +} +console.log("BEFORE/AFTER GATE PASSED: autoCompact is a real behavior change"); +'`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 8: Self-review + peer review ────────────────────────── + .step("capture-diff", { + type: "deterministic", + dependsOn: ["before-after-gate"], + command: `cd ${TRAJ_ROOT} && git diff src/sdk/client.ts tests/sdk/workflow-compact.test.ts 2>&1 | head -500`, + captureOutput: true, + failOnError: false, + }) + .step("self-review", { + agent: "self-reviewer", + dependsOn: ["capture-diff"], + task: `You wrote this implementation. 
Self-review the diff below for correctness, edge cases, and adherence to the brief. + +Diff: +{{steps.capture-diff.output}} + +Checklist: +- autoCompact is opt-in: new TrajectoryClient() with no options MUST NOT trigger compaction +- When autoCompact.mechanical is passed to compactWorkflow(), the CLI spawn really includes --mechanical +- Failures in compactWorkflow() are swallowed (logged, not thrown) so complete() never rejects +- complete() still saves the raw trajectory BEFORE attempting compaction (so failure doesn't lose work) +- Both session.complete() and session.done() trigger autoCompact +- No other files touched + +Write findings to ${TRAJ_ROOT}/.trajectories-test/autocompact/self-review.md. If all clear, the file must contain the single line "SELF_REVIEW_CLEAR". Otherwise list concrete issues.`, + verification: { + type: "file_exists", + value: ".trajectories-test/autocompact/self-review.md", + }, + }) + .step("peer-review", { + agent: "reviewer", + dependsOn: ["capture-diff"], + task: `Peer-review the implementation diff below. You did NOT write it. + +Diff: +{{steps.capture-diff.output}} + +Focus on: +1. Opt-in safety: does default behavior change? A user who does not pass autoCompact must see identical behavior to before. +2. Error handling: compactWorkflow can throw (missing CLI, subprocess failure). Does complete() swallow errors but log them? +3. Ordering: the raw trajectory MUST be saved before compaction runs — otherwise a compaction failure could lose the user's work. +4. Tests hit both success and failure paths? Tests avoid real LLM calls (mechanical: true)? +5. The SDK still shells out to trail compact for compaction — it does not re-implement compaction inline? + +Write findings to ${TRAJ_ROOT}/.trajectories-test/autocompact/peer-review.md. End with "PEER_REVIEW_APPROVED" on its own line if acceptable. 
Otherwise list blocking issues.`, + verification: { + type: "file_exists", + value: ".trajectories-test/autocompact/peer-review.md", + }, + }) + + .step("address-review", { + agent: "impl", + dependsOn: ["self-review", "peer-review"], + task: `Read both review files and address any blocking issues. If both end with SELF_REVIEW_CLEAR / PEER_REVIEW_APPROVED and no blocking items, do nothing. + +Self-review: ${TRAJ_ROOT}/.trajectories-test/autocompact/self-review.md +Peer-review: ${TRAJ_ROOT}/.trajectories-test/autocompact/peer-review.md + +Only edit: src/sdk/client.ts, tests/sdk/workflow-compact.test.ts. + +After your edits, re-run both: + npm run typecheck + npx vitest run tests/sdk/workflow-compact.test.ts +until both pass.`, + verification: { type: "exit_code", value: "0" }, + }) + + // ── Phase 9: Final gates ──────────────────────────────────────── + .step("tests-after-review", { + type: "deterministic", + dependsOn: ["address-review"], + command: `cd ${TRAJ_ROOT} && npx vitest run tests/sdk/workflow-compact.test.ts 2>&1 | tail -60`, + captureOutput: true, + failOnError: true, + }) + .step("typecheck-after-review", { + type: "deterministic", + dependsOn: ["address-review"], + command: `cd ${TRAJ_ROOT} && npm run typecheck 2>&1 | tail -20`, + captureOutput: true, + failOnError: true, + }) + .step("regression-tests", { + type: "deterministic", + dependsOn: ["tests-after-review", "typecheck-after-review"], + command: `cd ${TRAJ_ROOT} && npm run test:run 2>&1 | tail -40; echo "REG_EXIT: $?"`, + captureOutput: true, + failOnError: false, + }) + .step("fix-regressions", { + agent: "impl", + dependsOn: ["regression-tests"], + task: `If existing tests broke, fix only the regressions caused by our changes. If all passed, do nothing. + +Regression output: +{{steps.regression-tests.output}} + +Only edit: src/sdk/client.ts, tests/sdk/workflow-compact.test.ts. 
Re-run \`npm run test:run\` until green.`,
+      verification: { type: "exit_code", value: "0" },
+    })
+    .step("regression-final", {
+      type: "deterministic",
+      dependsOn: ["fix-regressions"],
+      command: `cd ${TRAJ_ROOT} && npm run test:run 2>&1 | tail -30`,
+      captureOutput: true,
+      failOnError: true,
+    })
+
+    // ── Phase 10: Commit + push to PR ─────────────────────────────
+    .step("commit", {
+      type: "deterministic",
+      dependsOn: ["regression-final"],
+      command: `cd ${TRAJ_ROOT} && git add src/sdk/client.ts tests/sdk/workflow-compact.test.ts workflows/sdk-autocompact-option.ts scripts/autocompact-probe.mts && git commit -m "feat(sdk): autoCompact option on TrajectoryClient auto-runs compactWorkflow on complete
+
+When TrajectoryClient is constructed with autoCompact: true (or an
+options object with mechanical/markdown overrides) and the trajectory
+has a workflowId stamped, session.complete() and session.done() will
+automatically shell out to trail compact --workflow <id> after saving
+the raw trajectory. The compacted artifact appears at
+.trajectories/compacted/workflow-<id>.{json,md}.
+
+This removes the need for a separate compact step in any SDK consumer
+running under a relay workflow — just set TRAJECTORIES_WORKFLOW_ID in
+the environment and construct the client with autoCompact: true, and
+complete() produces the tight artifact as a side effect. 
+
+- autoCompact is opt-in: default behavior unchanged
+- Compaction failures are logged but do NOT fail complete() — the raw
+  trajectory is always saved first
+- Backed by a BEFORE/AFTER validation workflow under workflows/
+- Tests cover all four permutations plus graceful failure" 2>&1`,
+      captureOutput: true,
+      failOnError: true,
+    })
+    .step("push", {
+      type: "deterministic",
+      dependsOn: ["commit"],
+      command: `cd ${TRAJ_ROOT} && branch=$(git rev-parse --abbrev-ref HEAD) && git push origin "$branch" 2>&1`,
+      captureOutput: true,
+      failOnError: true,
+    })
+    .step("comment-pr", {
+      type: "deterministic",
+      dependsOn: ["push"],
+      command: `cd ${TRAJ_ROOT} && branch=$(git rev-parse --abbrev-ref HEAD) && pr=$(gh pr list --head "$branch" --json number --jq '.[0].number' 2>/dev/null) && if [ -n "$pr" ]; then
+  gh pr comment "$pr" --body "## sdk-autocompact-option — validated end-to-end
+
+BEFORE/AFTER gate PASSED. The feature is a genuine behavior change, not a no-op:
+
+- **BEFORE**: \\\`new TrajectoryClient()\\\` + session.done() produces NO compacted file (baseline locked).
+- **AFTER**: \\\`new TrajectoryClient({ autoCompact: { mechanical: true } })\\\` + session.done() with TRAJECTORIES_WORKFLOW_ID set automatically produces \\\`.trajectories/compacted/workflow-<id>.{json,md}\\\`.
+
+Ran via \\\`agent-relay run workflows/sdk-autocompact-option.ts\\\` with codex impl + claude tests + claude peer review + codex self-review. 
+" 2>&1 +else + echo "No open PR for branch $branch — skipping comment" +fi`, + captureOutput: true, + failOnError: false, + }) + .step("print-summary", { + type: "deterministic", + dependsOn: ["comment-pr"], + command: `cd ${TRAJ_ROOT} && echo "=== COMMIT ===" && git log -1 --oneline && echo "=== PR ===" && (gh pr list --head "$(git rev-parse --abbrev-ref HEAD)" --json url --jq '.[0].url' || true)`, + captureOutput: true, + failOnError: false, + }) + + .onError("retry", { maxRetries: 1, retryDelayMs: 10_000 }) + .run({ cwd: process.cwd() }); + + console.log("Workflow status:", result.status); +} + +runWorkflow().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/workflows/sdk-workflow-autocompact.ts b/workflows/sdk-workflow-autocompact.ts new file mode 100644 index 0000000..039d6e7 --- /dev/null +++ b/workflows/sdk-workflow-autocompact.ts @@ -0,0 +1,786 @@ +/** + * sdk-workflow-autocompact.ts + * + * Adds workflow-aware auto-compaction to the trajectories SDK + trail CLI. + * + * Feature: + * - Trajectory type gains an optional workflowId field + * - TrajectoryClient reads TRAJECTORIES_WORKFLOW_ID from env and stamps + * every trajectory it creates with that id + * - New SDK helper: compactWorkflow(workflowId) — spawns the local `trail` + * CLI so compaction logic stays in one place (the local CLI) + * - `trail compact --workflow ` filter selects all trajectories for a + * given workflow run and produces a single compacted JSON + markdown + * + * Validation strategy (80 → 100): + * 1. BEFORE: run a reproducible benchmark script with the current SDK and + * capture raw size / event counts. The new workflowId field is absent. + * 2. Implement the feature (codex). + * 3. Tests (claude) — unit tests for env-var tagging + real CLI invocation + * through compactWorkflow(). + * 4. AFTER: run the same benchmark with TRAJECTORIES_WORKFLOW_ID set, then + * invoke `trail compact --workflow --markdown`. 
Compaction uses the + * locally-installed claude or codex CLI — no API key ever required. + * 5. Hard gate: compacted bytes meaningfully smaller than raw, markdown + * present, narrative + decisions fields populated. + * + * LLM provider strategy (user never sets an API key): + * - Default provider order is reordered so "auto" tries the CLI provider + * FIRST. Supported CLIs: claude, codex, gemini, opencode (declaration + * order = preference). All reuse existing CLI auth — no keys needed. + * - TRAJECTORIES_LLM_CLI=claude|codex|gemini|opencode pins the CLI when + * multiple are installed. + * - OpenAI / Anthropic API providers remain available but only kick in if + * explicitly selected via TRAJECTORIES_LLM_PROVIDER=openai|anthropic. + * - Unit tests use --mechanical for determinism, speed, no LLM subprocess. + * - The supported-CLI list is kept inline (not imported from agent-relay) + * because that dep was removed in 7e9783c. See provider.ts JSDoc. + * 6. Self-review (codex) + peer review (claude) with diff as input. + * 7. Address review feedback; re-run tests + regression; commit. + * 8. Push to origin (updates the open PR on this branch) and comment the + * before/after stats on the PR via `gh pr comment`. 
+ * + * Team split (per relay-80-100 skill): + * - Codex implements SDK + CLI edits (impl) + * - Claude writes tests and runs before/after comparison (tester) + * - Claude peer-reviews the diff (reviewer) + * - Codex self-reviews its own diff (self-reviewer) + * + * Run: agent-relay run workflows/sdk-workflow-autocompact.ts + */ + +import { workflow } from "@agent-relay/sdk/workflows"; + +const TRAJ_ROOT = process.cwd(); + +async function runWorkflow() { + const result = await workflow("sdk-workflow-autocompact") + .description( + "Add workflow-aware auto-compaction to the trajectories SDK and trail CLI, with end-to-end before/after validation", + ) + .pattern("dag") + .channel("wf-autocompact") + .maxConcurrency(6) + .timeout(3_600_000) + + .agent("impl", { + cli: "codex", + preset: "worker", + role: "Implements SDK + CLI edits one file at a time", + retries: 2, + }) + .agent("tester", { + cli: "claude", + preset: "worker", + role: "Writes tests and runs the before/after E2E comparison", + retries: 2, + }) + .agent("reviewer", { + cli: "claude", + preset: "worker", + role: "Peer-reviews the implementation diff", + retries: 1, + }) + .agent("self-reviewer", { + cli: "codex", + preset: "worker", + role: "Self-reviews the implementation it wrote", + retries: 1, + }) + + // ── Phase 0: Clean workspace ───────────────────────────────────── + .step("clean-workspace", { + type: "deterministic", + command: `rm -rf ${TRAJ_ROOT}/.trajectories-test && mkdir -p ${TRAJ_ROOT}/.trajectories-test/before ${TRAJ_ROOT}/.trajectories-test/after && echo OK`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 1: Read context for downstream edits ─────────────────── + .step("read-types", { + type: "deterministic", + dependsOn: ["clean-workspace"], + command: `cat ${TRAJ_ROOT}/src/core/types.ts`, + captureOutput: true, + }) + .step("read-client", { + type: "deterministic", + dependsOn: ["clean-workspace"], + command: `cat ${TRAJ_ROOT}/src/sdk/client.ts`, + captureOutput: 
true, + }) + .step("read-compact-cmd", { + type: "deterministic", + dependsOn: ["clean-workspace"], + command: `sed -n '1,400p' ${TRAJ_ROOT}/src/cli/commands/compact.ts`, + captureOutput: true, + }) + .step("read-sdk-index", { + type: "deterministic", + dependsOn: ["clean-workspace"], + command: `cat ${TRAJ_ROOT}/src/sdk/index.ts 2>/dev/null || echo "(no sdk/index.ts)"`, + captureOutput: true, + }) + .step("read-provider", { + type: "deterministic", + dependsOn: ["clean-workspace"], + command: `sed -n '1,500p' ${TRAJ_ROOT}/src/compact/provider.ts`, + captureOutput: true, + }) + .step("read-schema", { + type: "deterministic", + dependsOn: ["clean-workspace"], + command: `sed -n '1,120p' ${TRAJ_ROOT}/src/core/schema.ts`, + captureOutput: true, + }) + + // ── Phase 2: Create the reusable benchmark sample script ───────── + .step("create-benchmark-script", { + agent: "tester", + dependsOn: ["read-client", "read-sdk-index"], + task: `Create ${TRAJ_ROOT}/scripts/benchmark-compaction.ts — a reproducible benchmark that uses the CURRENT trajectories SDK to record a deliberately noisy sample trajectory. + +Storage convention — IMPORTANT: +- The script must use the default TrajectoryClient storage: it always writes into \`/.trajectories/\`. Do NOT read TRAJECTORIES_DIR or any custom env var for the base directory. The CALLER cd's into the right directory before invoking this script; that is how isolation works. +- The script MUST read TRAJECTORIES_WORKFLOW_ID from process.env and pass it through to the trajectory (via whatever option TrajectoryClient.start accepts, or the env var will naturally be picked up by the SDK once the feature is implemented — on the BEFORE run this is a no-op and that is expected). 
+
+Content (deliberately noisy so the compaction ratio is meaningful):
+- One session with 5 chapters, at least 8 decisions, 15 findings, and 45+ low-significance tool_call / tool_result / thinking events (this is the noise the compactor strips — small fixtures produce weak ratios, so keep the noise volume high)
+- Call session.complete(...) with a full retrospective
+- Print to stdout exactly once: \`TRAJECTORY_ID=\` + <the trajectory id>
+
+Use only APIs that already exist in src/sdk/client.ts today:
+{{steps.read-client.output}}
+
+And the current sdk/index exports:
+{{steps.read-sdk-index.output}}
+
+Write the file to disk. Do NOT log to stdout instead of writing. Keep under 120 lines.`,
+      verification: {
+        type: "file_exists",
+        value: "scripts/benchmark-compaction.ts",
+      },
+    })
+    .step("verify-benchmark-script", {
+      type: "deterministic",
+      dependsOn: ["create-benchmark-script"],
+      command: `test -f ${TRAJ_ROOT}/scripts/benchmark-compaction.ts && wc -l ${TRAJ_ROOT}/scripts/benchmark-compaction.ts | awk '{ if ($1 < 20) { print "TOO_SHORT"; exit 1 } else { print "OK " $1 " lines" } }'`,
+      failOnError: true,
+      captureOutput: true,
+    })
+
+    // ── Phase 3: BEFORE capture ─────────────────────────────────────
+    .step("before-run", {
+      type: "deterministic",
+      dependsOn: ["verify-benchmark-script"],
+      command: `cd ${TRAJ_ROOT}/.trajectories-test/before && npx tsx ${TRAJ_ROOT}/scripts/benchmark-compaction.ts 2>&1 | tee run.log`,
+      captureOutput: true,
+      failOnError: true,
+    })
+    .step("before-stats", {
+      type: "deterministic",
+      dependsOn: ["before-run"],
+      command: `cd ${TRAJ_ROOT} && node -e '
+const fs = require("fs");
+const path = require("path");
+const root = ".trajectories-test/before";
+function walk(dir) {
+  const out = [];
+  if (!fs.existsSync(dir)) return out;
+  for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
+    const p = path.join(dir, e.name);
+    if (e.isDirectory()) out.push(...walk(p));
+    else if (e.name.endsWith(".json") && !p.includes("/compacted/")) 
out.push(p); + } + return out; +} +const files = walk(root); +if (files.length === 0) { console.error("NO_RAW_TRAJECTORY"); process.exit(1); } +const raw = files[0]; +const bytes = fs.statSync(raw).size; +const data = JSON.parse(fs.readFileSync(raw, "utf8")); +let events = 0; +for (const c of data.chapters || []) events += (c.events || []).length; +const stats = { + raw_file: raw, + raw_bytes: bytes, + event_count: events, + has_workflow_id: Object.prototype.hasOwnProperty.call(data, "workflowId") ? 1 : 0, +}; +fs.writeFileSync(".trajectories-test/before/stats.json", JSON.stringify(stats, null, 2)); +console.log(JSON.stringify(stats)); +'`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 4: Implementation — one file per step ───────────────── + .step("edit-types", { + agent: "impl", + dependsOn: ["before-stats", "read-types"], + task: `Edit ${TRAJ_ROOT}/src/core/types.ts. + +Current contents: +{{steps.read-types.output}} + +Add an OPTIONAL field \`workflowId?: string\` to the Trajectory interface with a one-line JSDoc: "Opaque id set by the workflow runner via TRAJECTORIES_WORKFLOW_ID env var. Lets trail compact --workflow collate all trajectories from a single workflow run." If CreateTrajectoryInput (or the equivalent constructor input type) exists in this file, add workflowId there too. + +Only edit this one file.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("verify-edit-types", { + type: "deterministic", + dependsOn: ["edit-types"], + command: `cd ${TRAJ_ROOT} && if git diff --quiet src/core/types.ts; then echo NOT_MODIFIED; exit 1; fi && grep -q "workflowId" src/core/types.ts && echo OK`, + failOnError: true, + captureOutput: true, + }) + + .step("edit-client", { + agent: "impl", + dependsOn: ["verify-edit-types", "read-client"], + task: `Edit ${TRAJ_ROOT}/src/sdk/client.ts. + +Current contents: +{{steps.read-client.output}} + +Two changes: + +1. 
In TrajectoryClient.start() (or wherever a new Trajectory is minted), read process.env.TRAJECTORIES_WORKFLOW_ID. If non-empty, stamp the trajectory's workflowId field. If unset/empty, leave workflowId undefined. Also accept an explicit workflowId option on start() that takes precedence over the env var. + +2. Export a new top-level helper from this file: + export async function compactWorkflow( + workflowId: string, + options?: { markdown?: boolean; mechanical?: boolean; cwd?: string } + ): Promise<{ compactedPath: string; markdownPath?: string }> + + Behavior: + - Spawn the trail CLI via child_process.spawn. Resolve the binary by: + a. process.env.TRAJECTORIES_CLI if set + b. otherwise require.resolve("agent-trajectories/package.json") + bin lookup → dist/cli/index.js invoked with process.execPath + c. fallback: "trail" on PATH + - Args: ["compact", "--workflow", workflowId, "--all"] plus "--markdown" and/or "--mechanical" when options set + - Pipe stderr through to the caller's stderr + - On non-zero exit: throw Error("compactWorkflow failed: " + stderr) + - Parse stdout for the compacted file paths (the CLI already logs them) + - DO NOT re-implement compaction logic in the SDK. Shelling out is the entire point. + +Only edit src/sdk/client.ts. If sdk/index.ts needs re-exporting compactWorkflow, that will be a separate step.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("verify-edit-client", { + type: "deterministic", + dependsOn: ["edit-client"], + command: `cd ${TRAJ_ROOT} && if git diff --quiet src/sdk/client.ts; then echo NOT_MODIFIED; exit 1; fi && grep -Eq "TRAJECTORIES_WORKFLOW_ID" src/sdk/client.ts && grep -q "compactWorkflow" src/sdk/client.ts && echo OK`, + failOnError: true, + captureOutput: true, + }) + + .step("edit-sdk-index", { + agent: "impl", + dependsOn: ["verify-edit-client", "read-sdk-index"], + task: `Edit ${TRAJ_ROOT}/src/sdk/index.ts. 
+ +Current contents: +{{steps.read-sdk-index.output}} + +Add a re-export for compactWorkflow so users can \`import { compactWorkflow } from "agent-trajectories/sdk"\`. If src/sdk/index.ts does not exist, create it and re-export the same symbols the existing entrypoint exports plus compactWorkflow. + +Only edit this one file.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("verify-edit-sdk-index", { + type: "deterministic", + dependsOn: ["edit-sdk-index"], + command: `cd ${TRAJ_ROOT} && test -f src/sdk/index.ts && grep -q "compactWorkflow" src/sdk/index.ts && echo OK`, + failOnError: true, + captureOutput: true, + }) + + .step("edit-provider", { + agent: "impl", + dependsOn: ["verify-edit-sdk-index", "read-provider"], + task: `Edit ${TRAJ_ROOT}/src/compact/provider.ts. Two coupled changes — make the CLI provider the preferred default AND expand the set of supported CLIs so users with gemini or opencode (not just claude/codex) also get zero-config compaction. + +Current contents: +{{steps.read-provider.output}} + +Change 1 — provider resolution order (no API key required): +- In resolveProvider() (the entry point that reads TRAJECTORIES_LLM_PROVIDER and handles the "auto" default): + * When the effective provider is "auto" (unset or explicitly "auto"), TRY resolveCLIProvider() FIRST. + * If a CLI is found, return it immediately — do NOT check OPENAI_API_KEY / ANTHROPIC_API_KEY. + * Only fall back to API-key providers (OpenAI / Anthropic) when no supported CLI is installed. + * Keep explicit "openai" / "anthropic" / "cli" selections working unchanged. + +Change 2 — expand SUPPORTED_CLIS and honor TRAJECTORIES_LLM_CLI: +- Update the SUPPORTED_CLIS constant from ["claude", "codex"] to ["claude", "codex", "gemini", "opencode"]. Preference order = array order. Update the SupportedCli type alias accordingly. 
+- Extend buildCliArgs(cli) to return correct one-shot invocation args for each CLI: + claude → ["-p", "--output-format", "text"] (keep existing) + codex → ["exec", "--no-color"] (keep existing) + gemini → ["-p"] (gemini CLI one-shot prompt flag) + opencode → ["run", "--no-color"] (opencode one-shot run subcommand) + If the existing claude/codex args differ from the above, keep the existing ones — do not change what already works. +- In resolveCLIProvider(), honor a new env var TRAJECTORIES_LLM_CLI: + * If set to one of the supported values, only try that CLI (skip others). + * If unset, iterate SUPPORTED_CLIS in declaration order. + * If set to an unsupported value, log a warning to stderr and fall through to auto-detect. +- Add a JSDoc comment above SUPPORTED_CLIS: "Kept inline (not imported from @agent-relay/sdk) because that dep was removed in 7e9783c. When agent-relay ships new compaction-capable CLIs, add them here manually." + +Do NOT change the CLIProvider class, findBinary(), or CLI_SEARCH_PATHS. Only touch resolveProvider + resolveCLIProvider + SUPPORTED_CLIS + SupportedCli + buildCliArgs. 
+ +Only edit this one file.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("verify-edit-provider", { + type: "deterministic", + dependsOn: ["edit-provider"], + command: `cd ${TRAJ_ROOT} && if git diff --quiet src/compact/provider.ts; then echo NOT_MODIFIED; exit 1; fi && \ +grep -q "TRAJECTORIES_LLM_CLI" src/compact/provider.ts && \ +grep -q '"gemini"' src/compact/provider.ts && \ +grep -q '"opencode"' src/compact/provider.ts && \ +echo OK`, + failOnError: true, + captureOutput: true, + }) + + .step("edit-schema-lenient", { + agent: "impl", + dependsOn: ["verify-edit-provider", "read-schema"], + task: `Edit ${TRAJ_ROOT}/src/core/schema.ts to make TrajectoryEventTypeSchema permissive so trajectories emitted by other tools (notably agent-relay, which adds event types like "completion-evidence" and "completion-marker") can be loaded by trail compact without being entirely rejected. + +Current contents: +{{steps.read-schema.output}} + +Change: update TrajectoryEventTypeSchema from a strict z.enum([...]) to a permissive union matching the pattern already used by TaskSourceSystemSchema at the top of this file: + + export const TrajectoryEventTypeSchema = z.union([ + z.literal("prompt"), + z.literal("thinking"), + z.literal("tool_call"), + z.literal("tool_result"), + z.literal("message_sent"), + z.literal("message_received"), + z.literal("decision"), + z.literal("finding"), + z.literal("reflection"), + z.literal("note"), + z.literal("error"), + z.string(), // Allow event types emitted by other tools (e.g. agent-relay's completion-evidence / completion-marker). Downstream code filters to known types. + ]); + +Keep TrajectoryEvent TypeScript type in src/core/types.ts unchanged — new code continues to use the strict union. This change only affects what validateTrajectory() will ACCEPT from disk, not what we write. + +Add a one-line JSDoc above the schema explaining the permissive design. 

+
+Only edit this one file.`,
+    verification: { type: "exit_code", value: "0" },
+  })
+  .step("verify-edit-schema-lenient", {
+    type: "deterministic",
+    dependsOn: ["edit-schema-lenient"],
+    command: `cd ${TRAJ_ROOT} && if git diff --quiet src/core/schema.ts; then echo NOT_MODIFIED; exit 1; fi && grep -q "TrajectoryEventTypeSchema" src/core/schema.ts && grep -A 20 "TrajectoryEventTypeSchema" src/core/schema.ts | grep -q "z.string()" && echo OK`,
+    failOnError: true,
+    captureOutput: true,
+  })
+
+  .step("edit-compact-cmd", {
+    agent: "impl",
+    dependsOn: ["verify-edit-schema-lenient", "read-compact-cmd"],
+    task: `Edit ${TRAJ_ROOT}/src/cli/commands/compact.ts.
+
+Current contents (first 400 lines):
+{{steps.read-compact-cmd.output}}
+
+Add a new selector flag \`--workflow <id>\` alongside the existing --ids / --pr / --branch / --commits filters. When present:
+- loadTrajectories() must filter to trajectories whose \`workflowId === id\`
+- The output file name must be \`workflow-<id>.json\` (and \`.md\` when --markdown). Place under .trajectories/compacted/ per existing convention.
+- Surface the workflow id in any printed summary
+- All other flags continue to work as before
+
+Only edit this one file. 
Keep existing behavior intact when --workflow is not used.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("verify-edit-compact-cmd", { + type: "deterministic", + dependsOn: ["edit-compact-cmd"], + command: `cd ${TRAJ_ROOT} && if git diff --quiet src/cli/commands/compact.ts; then echo NOT_MODIFIED; exit 1; fi && grep -q -- "--workflow" src/cli/commands/compact.ts && echo OK`, + failOnError: true, + captureOutput: true, + }) + + // ── Phase 5: Typecheck ────────────────────────────────────────── + .step("typecheck", { + type: "deterministic", + dependsOn: ["verify-edit-compact-cmd"], + command: `cd ${TRAJ_ROOT} && npm run typecheck 2>&1 | tail -60; echo "EXIT: $?"`, + captureOutput: true, + failOnError: false, + }) + .step("fix-typecheck", { + agent: "impl", + dependsOn: ["typecheck"], + task: `If the typecheck output below shows errors (non-zero EXIT), fix them. If it shows EXIT: 0 and no errors, do nothing. + +Output: +{{steps.typecheck.output}} + +Only edit files you previously touched: src/core/types.ts, src/core/schema.ts, src/sdk/client.ts, src/sdk/index.ts, src/compact/provider.ts, src/cli/commands/compact.ts. Re-run \`npm run typecheck\` until it passes.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("typecheck-final", { + type: "deterministic", + dependsOn: ["fix-typecheck"], + command: `cd ${TRAJ_ROOT} && npm run typecheck 2>&1 | tail -20`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 6: Tests (Claude writes, vitest runs) ────────────────── + .step("create-tests", { + agent: "tester", + dependsOn: ["typecheck-final"], + task: `Create ${TRAJ_ROOT}/tests/sdk/workflow-compact.test.ts using vitest (this project uses vitest — see package.json scripts). + +IMPORTANT — isolation strategy: +- The trajectories SDK writes to \`/.trajectories/\` by default. 
For isolation, each test must EITHER (a) process.chdir() into a tmp dir and restore the cwd in afterEach, OR (b) pass an explicit baseDir to TrajectoryClient if that option exists. Do NOT rely on an env var like TRAJECTORIES_DIR — the CLI and storage honor TRAJECTORIES_DATA_DIR / TRAJECTORIES_SEARCH_PATHS, not TRAJECTORIES_DIR.
+
+Cover five cases:
+
+1. TRAJECTORIES_WORKFLOW_ID env var stamps workflowId on trajectories created via TrajectoryClient.start(). Save + restore process.env.
+
+2. Without TRAJECTORIES_WORKFLOW_ID set, the created trajectory has workflowId === undefined.
+
+3. CLI filter end-to-end: cd into a tmp dir, create two trajectories, one with workflowId "wf-a", one without. Spawn the CLI via child_process.spawnSync from that cwd: \`npx tsx \${absoluteRepoRoot}/src/cli/index.ts compact --workflow wf-a --mechanical --all\`. Assert the compacted JSON file exists under \`<cwd>/.trajectories/compacted/workflow-wf-a.json\` and its sourceTrajectories array contains only the tagged trajectory id.
+
+4. compactWorkflow() SDK helper end-to-end: cd into a tmp dir containing one tagged trajectory, call await compactWorkflow("wf-a", { mechanical: true, markdown: true }), assert the returned compactedPath exists on disk.
+
+5. Schema leniency: in a tmp cwd, write a raw trajectory JSON that includes an event with type "completion-evidence" (not one of the canonical event types) plus one normal "decision" event. Run trail compact via spawnSync. Assert the command exits 0 and produces a compacted file — i.e. the unknown event type does NOT cause the whole trajectory to be dropped.
+
+Write the file. Use absolute paths for the CLI entrypoint. 
Always cd back and clean up tmp dirs in afterEach/finally.`, + verification: { + type: "file_exists", + value: "tests/sdk/workflow-compact.test.ts", + }, + }) + .step("run-tests", { + type: "deterministic", + dependsOn: ["create-tests"], + command: `cd ${TRAJ_ROOT} && npx vitest run tests/sdk/workflow-compact.test.ts 2>&1 | tail -100; echo "EXIT: $?"`, + captureOutput: true, + failOnError: false, + }) + .step("fix-tests", { + agent: "tester", + dependsOn: ["run-tests"], + task: `If the test output below shows failures (non-zero EXIT or FAIL lines), fix them — could be a test bug or a source bug. If all passed, do nothing. + +Test output: +{{steps.run-tests.output}} + +Re-run \`npx vitest run tests/sdk/workflow-compact.test.ts\` until green. You may edit tests/sdk/workflow-compact.test.ts OR the impl files (src/sdk/client.ts, src/cli/commands/compact.ts, src/sdk/index.ts, src/compact/provider.ts, src/core/schema.ts, src/core/types.ts).`, + verification: { type: "exit_code", value: "0" }, + }) + .step("run-tests-final", { + type: "deterministic", + dependsOn: ["fix-tests"], + command: `cd ${TRAJ_ROOT} && npx vitest run tests/sdk/workflow-compact.test.ts 2>&1 | tail -80`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 7: AFTER capture ───────────────────────────────────── + .step("after-run", { + type: "deterministic", + dependsOn: ["run-tests-final"], + command: `cd ${TRAJ_ROOT}/.trajectories-test/after && TRAJECTORIES_WORKFLOW_ID=bench-after npx tsx ${TRAJ_ROOT}/scripts/benchmark-compaction.ts 2>&1 | tee run.log`, + captureOutput: true, + failOnError: true, + }) + .step("after-compact", { + type: "deterministic", + dependsOn: ["after-run"], + command: `cd ${TRAJ_ROOT}/.trajectories-test/after && npx tsx ${TRAJ_ROOT}/src/cli/index.ts compact --workflow bench-after --markdown --all 2>&1 | tee -a run.log && find .trajectories -type d -name compacted -exec ls -la {} \\; 2>&1 || true`, + captureOutput: true, + failOnError: true, + }) + 
.step("after-stats", { + type: "deterministic", + dependsOn: ["after-compact"], + command: `cd ${TRAJ_ROOT} && node -e ' +const fs = require("fs"); +const path = require("path"); +const root = ".trajectories-test/after"; +function walk(dir) { + const out = []; + if (!fs.existsSync(dir)) return out; + for (const e of fs.readdirSync(dir, { withFileTypes: true })) { + const p = path.join(dir, e.name); + if (e.isDirectory()) out.push(...walk(p)); + else if (e.name.endsWith(".json") || e.name.endsWith(".md")) out.push(p); + } + return out; +} +const files = walk(root); +const raw = files.find(f => f.endsWith(".json") && !f.includes("/compacted/")); +const compacted = files.find(f => f.includes("/compacted/") && f.endsWith(".json") && /workflow-bench-after/.test(f)); +const md = files.find(f => f.includes("/compacted/") && f.endsWith(".md") && /workflow-bench-after/.test(f)); +if (!raw) { console.error("NO_RAW"); process.exit(1); } +if (!compacted) { console.error("NO_COMPACTED"); process.exit(1); } +if (!md) { console.error("NO_MARKDOWN"); process.exit(1); } +const rawData = JSON.parse(fs.readFileSync(raw, "utf8")); +const compactedData = JSON.parse(fs.readFileSync(compacted, "utf8")); +let events = 0; +for (const c of rawData.chapters || []) events += (c.events || []).length; +const stats = { + raw_file: raw, + compacted_file: compacted, + md_file: md, + raw_bytes: fs.statSync(raw).size, + compacted_bytes: fs.statSync(compacted).size, + md_bytes: fs.statSync(md).size, + raw_event_count: events, + raw_workflow_id: rawData.workflowId || null, + has_narrative: typeof compactedData.narrative === "string" && compactedData.narrative.length > 0 ? 1 : 0, + has_decisions: Array.isArray(compactedData.decisions) && compactedData.decisions.length > 0 ? 1 : 0, + source_trajectory_count: Array.isArray(compactedData.sourceTrajectories) ? 
compactedData.sourceTrajectories.length : 0, +}; +fs.writeFileSync(".trajectories-test/after/stats.json", JSON.stringify(stats, null, 2)); +console.log(JSON.stringify(stats, null, 2)); +'`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 8: BEFORE/AFTER comparison gate ──────────────────────── + .step("before-after-gate", { + type: "deterministic", + dependsOn: ["after-stats", "before-stats"], + command: `cd ${TRAJ_ROOT} && node -e ' +const fs = require("fs"); +const before = JSON.parse(fs.readFileSync(".trajectories-test/before/stats.json", "utf8")); +const after = JSON.parse(fs.readFileSync(".trajectories-test/after/stats.json", "utf8")); +const failures = []; +if (before.has_workflow_id !== 0) failures.push("BEFORE trajectory already had workflowId — sample script was not using the pre-feature SDK"); +if (after.raw_workflow_id !== "bench-after") failures.push("AFTER raw trajectory missing workflowId=\\"bench-after\\", got " + JSON.stringify(after.raw_workflow_id)); +if (after.source_trajectory_count < 1) failures.push("AFTER compacted sourceTrajectories empty — --workflow filter selected nothing"); +if (after.compacted_bytes >= after.raw_bytes) failures.push("AFTER compacted (" + after.compacted_bytes + "B) not smaller than raw (" + after.raw_bytes + "B)"); +if (after.compacted_bytes > after.raw_bytes * 0.95) failures.push("AFTER compaction ratio too weak: " + (after.compacted_bytes / after.raw_bytes).toFixed(2) + " (want <= 0.95). 
Fixture may be too small — real trajectories with 200+ events compress much better."); +if (after.has_narrative < 1) failures.push("AFTER compacted missing narrative field"); +if (after.has_decisions < 1) failures.push("AFTER compacted missing decisions field"); +if (after.md_bytes < 200) failures.push("AFTER markdown too short: " + after.md_bytes + " bytes"); +console.log("BEFORE:", JSON.stringify(before)); +console.log("AFTER:", JSON.stringify(after)); +if (failures.length) { + console.error("\\nBEFORE/AFTER GATE FAILED:"); + for (const f of failures) console.error(" - " + f); + process.exit(1); +} +const ratio = (100 * after.compacted_bytes / after.raw_bytes).toFixed(0); +console.log("\\nBEFORE/AFTER GATE PASSED"); +console.log("raw: " + after.raw_bytes + "B, compacted: " + after.compacted_bytes + "B (" + ratio + "% of raw), markdown: " + after.md_bytes + "B"); +'`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 9: Self-review + Peer review ────────────────────────── + .step("capture-diff", { + type: "deterministic", + dependsOn: ["before-after-gate"], + command: `cd ${TRAJ_ROOT} && git diff src/core/types.ts src/core/schema.ts src/sdk/client.ts src/sdk/index.ts src/compact/provider.ts src/cli/commands/compact.ts tests/sdk/workflow-compact.test.ts scripts/benchmark-compaction.ts 2>&1 | head -900`, + captureOutput: true, + failOnError: false, + }) + .step("self-review", { + agent: "self-reviewer", + dependsOn: ["capture-diff"], + task: `You wrote this implementation. Self-review the diff for correctness, edge cases, and adherence to the brief. + +Diff: +{{steps.capture-diff.output}} + +Checklist: +- TrajectoryClient actually reads process.env.TRAJECTORIES_WORKFLOW_ID +- compactWorkflow() shells out to the local CLI, never re-implementing compaction +- trail compact --workflow filters by workflowId and produces workflow-.{json,md} +- Obvious bugs, missing null checks, hardcoded paths, dropped errors? 
+ +Write findings to ${TRAJ_ROOT}/.trajectories-test/self-review.md. If all clear, the file must contain the single line "SELF_REVIEW_CLEAR" on its own line. Otherwise list concrete issues.`, + verification: { + type: "file_exists", + value: ".trajectories-test/self-review.md", + }, + }) + .step("peer-review", { + agent: "reviewer", + dependsOn: ["capture-diff"], + task: `Peer-review the implementation diff below. You did NOT write it. + +Diff: +{{steps.capture-diff.output}} + +Focus on: +1. Single source of truth: SDK MUST shell out to trail compact, never duplicate compaction logic inline +2. Env-var semantics: TRAJECTORIES_WORKFLOW_ID is the implicit tagging mechanism; explicit option on start() takes precedence +3. Backwards compatibility: existing trail compact calls without --workflow still work unchanged +4. Test coverage: do the tests hit the REAL CLI via spawnSync, not a mock? +5. Binary resolution: does compactWorkflow() find the trail binary robustly (env var, bin lookup, PATH fallback)? +6. No API key required: with "auto" provider and NO OPENAI/ANTHROPIC env vars set, does compaction still succeed by picking any installed CLI? +7. SUPPORTED_CLIS covers claude, codex, gemini, opencode — and buildCliArgs() has a case for each that is a valid one-shot invocation, not a stub? +8. TRAJECTORIES_LLM_CLI override honored when set to any of the four supported values; unsupported values log a warning and fall through? +9. TrajectoryEventTypeSchema is now a permissive union: trajectories containing unknown event types (e.g. completion-evidence) parse successfully; the whole trajectory is NOT dropped on validation. Test #5 covers this. + +Write findings to ${TRAJ_ROOT}/.trajectories-test/peer-review.md. End with "PEER_REVIEW_APPROVED" on its own line if acceptable. 
Otherwise list blocking issues (one per line) before any approval line.`, + verification: { + type: "file_exists", + value: ".trajectories-test/peer-review.md", + }, + }) + + .step("address-review", { + agent: "impl", + dependsOn: ["self-review", "peer-review"], + task: `Read both review files. Address any blocking issues. If both end with SELF_REVIEW_CLEAR / PEER_REVIEW_APPROVED and no blocking items, do nothing. + +Self-review: ${TRAJ_ROOT}/.trajectories-test/self-review.md +Peer-review: ${TRAJ_ROOT}/.trajectories-test/peer-review.md + +Only edit: src/core/types.ts, src/core/schema.ts, src/sdk/client.ts, src/sdk/index.ts, src/compact/provider.ts, src/cli/commands/compact.ts, tests/sdk/workflow-compact.test.ts, scripts/benchmark-compaction.ts. + +After your edits, re-run both: + npm run typecheck + npx vitest run tests/sdk/workflow-compact.test.ts +until both pass.`, + verification: { type: "exit_code", value: "0" }, + }) + + // ── Phase 10: Final gates ────────────────────────────────────── + .step("tests-after-review", { + type: "deterministic", + dependsOn: ["address-review"], + command: `cd ${TRAJ_ROOT} && npx vitest run tests/sdk/workflow-compact.test.ts 2>&1 | tail -80`, + captureOutput: true, + failOnError: true, + }) + .step("typecheck-after-review", { + type: "deterministic", + dependsOn: ["address-review"], + command: `cd ${TRAJ_ROOT} && npm run typecheck 2>&1 | tail -20`, + captureOutput: true, + failOnError: true, + }) + .step("regression-tests", { + type: "deterministic", + dependsOn: ["tests-after-review", "typecheck-after-review"], + command: `cd ${TRAJ_ROOT} && npm run test:run 2>&1 | tail -80; echo "REG_EXIT: $?"`, + captureOutput: true, + failOnError: false, + }) + .step("fix-regressions", { + agent: "impl", + dependsOn: ["regression-tests"], + task: `If existing tests broke (non-zero REG_EXIT or FAIL lines), fix only the regressions caused by our changes. If all passed, do nothing. 
+ +Regression output: +{{steps.regression-tests.output}} + +Re-run \`npm run test:run\` until green.`, + verification: { type: "exit_code", value: "0" }, + }) + .step("regression-final", { + type: "deterministic", + dependsOn: ["fix-regressions"], + command: `cd ${TRAJ_ROOT} && npm run test:run 2>&1 | tail -40`, + captureOutput: true, + failOnError: true, + }) + + // ── Phase 11: Commit (deterministic) ──────────────────────────── + .step("commit", { + type: "deterministic", + dependsOn: ["regression-final"], + command: `cd ${TRAJ_ROOT} && git add src/core/types.ts src/core/schema.ts src/sdk/client.ts src/sdk/index.ts src/compact/provider.ts src/cli/commands/compact.ts tests/sdk/workflow-compact.test.ts scripts/benchmark-compaction.ts workflows/sdk-workflow-autocompact.ts && git commit -m "feat: workflow-aware auto-compaction (SDK tag + trail compact --workflow) + +Compaction stays in one place — the local trail CLI. The SDK only +tags trajectories and shells out. No API key is ever required: the +CLI provider (claude or codex, already installed and authenticated) +is the default, with API providers only used on explicit opt-in. + +- Trajectory gains an optional workflowId field +- TrajectoryClient stamps workflowId from TRAJECTORIES_WORKFLOW_ID env +- New SDK helper compactWorkflow() spawns trail compact --workflow +- trail compact --workflow filter selects trajectories by run +- Output: .trajectories/compacted/workflow-.{json,md} +- resolveProvider() now prefers the CLI provider in auto mode +- SUPPORTED_CLIS expanded to claude, codex, gemini, opencode +- buildCliArgs() extended with one-shot invocations for gemini + opencode +- TRAJECTORIES_LLM_CLI env var pins which CLI to use when multiple installed +- TrajectoryEventTypeSchema made permissive: accepts unknown event types + from other tools (e.g. 
agent-relay's completion-evidence) instead of + dropping the whole trajectory on validation failure +- E2E before/after benchmark script + vitest coverage + +When invoked under a relay workflow that sets TRAJECTORIES_WORKFLOW_ID, +the produced trajectory for the workflow run is collated and compacted +into a single tight artifact with narrative + decisions + lessons." 2>&1`, + captureOutput: true, + failOnError: true, + }) + .step("push", { + type: "deterministic", + dependsOn: ["commit"], + command: `cd ${TRAJ_ROOT} && branch=$(git rev-parse --abbrev-ref HEAD) && echo "Pushing $branch to origin..." && git push origin "$branch" 2>&1`, + captureOutput: true, + failOnError: true, + }) + .step("comment-pr", { + type: "deterministic", + dependsOn: ["push"], + command: `cd ${TRAJ_ROOT} && branch=$(git rev-parse --abbrev-ref HEAD) && pr_number=$(gh pr list --head "$branch" --json number --jq '.[0].number' 2>/dev/null) && if [ -n "$pr_number" ]; then + before=$(cat .trajectories-test/before/stats.json) + after=$(cat .trajectories-test/after/stats.json) + raw_bytes=$(node -e "console.log(require('./.trajectories-test/after/stats.json').raw_bytes)") + compacted_bytes=$(node -e "console.log(require('./.trajectories-test/after/stats.json').compacted_bytes)") + ratio=$(node -e "const s=require('./.trajectories-test/after/stats.json'); console.log(((100*s.compacted_bytes)/s.raw_bytes).toFixed(0))") + gh pr comment "$pr_number" --body "## sdk-workflow-autocompact — before/after + +Workflow ran end-to-end and passed the hard gate. + +**Before** (raw trajectory, pre-feature SDK): +\\\`\\\`\\\`json +$before +\\\`\\\`\\\` + +**After** (same benchmark with TRAJECTORIES_WORKFLOW_ID=bench-after, compacted via \\\`trail compact --workflow\\\`): +\\\`\\\`\\\`json +$after +\\\`\\\`\\\` + +**Result**: raw $raw_bytes B → compacted $compacted_bytes B ($ratio% of raw), markdown generated, narrative + decisions populated. 
+ +No API key was required — compaction used the local CLI provider (claude/codex/gemini/opencode). +" 2>&1 +else + echo "No open PR for branch $branch — skipping comment" +fi`, + captureOutput: true, + failOnError: false, + }) + .step("print-summary", { + type: "deterministic", + dependsOn: ["comment-pr"], + command: `cd ${TRAJ_ROOT} && echo "=== COMMIT ===" && git log -1 --oneline && echo "=== PR ===" && (gh pr list --head "$(git rev-parse --abbrev-ref HEAD)" --json number,url --jq '.[0]' || true) && echo "=== BEFORE STATS ===" && cat .trajectories-test/before/stats.json && echo "=== AFTER STATS ===" && cat .trajectories-test/after/stats.json`, + captureOutput: true, + failOnError: false, + }) + + .onError("retry", { maxRetries: 1, retryDelayMs: 10_000 }) + .run({ cwd: process.cwd() }); + + console.log("Workflow status:", result.status); +} + +runWorkflow().catch((error) => { + console.error(error); + process.exit(1); +});