diff --git a/.changeset/native-windows-support.md b/.changeset/native-windows-support.md new file mode 100644 index 0000000..1d6f7bc --- /dev/null +++ b/.changeset/native-windows-support.md @@ -0,0 +1,19 @@ +--- +'colonyq': patch +--- + +Native Windows support for the `colony` CLI. The bin entry was a POSIX shell +script (`bin/colony.sh`) that npm could not execute on Windows without WSL, +breaking every Windows install of the package. The shim is now a Node ES +module (`bin/colony.mjs`) using only `node:*` builtins, so npm's generated +`.cmd` / `.ps1` wrappers run it natively under cmd, PowerShell, and Git Bash. + +The daemon fast-path for `colony bridge lifecycle --json` is preserved — the +HTTP POST to `127.0.0.1:$COLONY_WORKER_PORT/api/bridge/lifecycle` now goes +through `node:http`, with a `node:net` connect probe (1s) before the request +(2s) so the fallback latency stays close to the curl-based version when the +daemon isn't running. Stdin is buffered and replayed on fallback, preserving +rule #10 (a dead daemon must never lose or block a write). + +CI now runs the build matrix on `ubuntu-latest`, `macos-latest`, and +`windows-latest` across Node 20 and 22 so this regression cannot recur. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 186cafe..0e03603 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,9 +19,11 @@ jobs: # The check fires automatically when the PR is marked ready for review # (the `ready_for_review` trigger above), and always on push to main. if: github.event_name != 'pull_request' || github.event.pull_request.draft == false - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: + os: [ubuntu-latest, macos-latest, windows-latest] node: ['20', '22'] steps: - uses: actions/checkout@v4 diff --git a/CLAUDE.md b/CLAUDE.md index dc874c0..3514e90 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -26,7 +26,7 @@ The signature property of the project is that **memory is stored compressed**. E 9. **No silent failures.** Hook and worker errors are logged as structured JSON; user-visible commands surface failures with a non-zero exit code and a short message. 10. **Writes never depend on the daemon being up.** Hooks must always complete the write before returning, and a dead worker must never lose or block a write. Two paths satisfy this: - **Native colony hook handlers** (`colony hook run pre-tool-use`, etc.) write observations synchronously through `MemoryStore.addObservation` in the same process. No IPC, no network. - - **The OMX lifecycle bridge** (`colony bridge lifecycle`, called by external integrations like oh-my-codex) takes a fast path through the worker daemon at `POST /api/bridge/lifecycle` when it is running and reachable within ~2s. On any failure (curl missing, daemon down, non-200, timeout, unknown flags, or invocation without `--json`), the wrapper at `apps/cli/bin/colony.sh` buffers stdin to a temp file and falls back to invoking the Node CLI in-process — same write path as before, identical SQLite file. The contract is regression-tested in `apps/cli/test/bin-shim.test.ts`. + - **The OMX lifecycle bridge** (`colony bridge lifecycle`, called by external integrations like oh-my-codex) takes a fast path through the worker daemon at `POST /api/bridge/lifecycle` when it is running and reachable within ~2s. On any failure (daemon down, non-200, timeout, unknown flags, or invocation without `--json`), the cross-platform Node shim at `apps/cli/bin/colony.mjs` buffers stdin and falls back to invoking the CLI in-process — same write path as before, identical SQLite file. The contract is regression-tested in `apps/cli/test/bin-shim.test.ts`. Hooks may *detach-spawn* the worker to kick off background embedding, but they must never wait on it. If the worker is down, writes still succeed; only the semantic-search side is degraded (BM25 keeps working). 11. **Read before edit tools.** Claude Code rejects `Edit` / `Update` / `MultiEdit` on an existing file unless that exact file path was read first in the current session. Before any edit tool call, run `Read` on the target file with the same relative path you will edit. diff --git a/apps/cli/bin/colony.mjs b/apps/cli/bin/colony.mjs new file mode 100755 index 0000000..1d99151 --- /dev/null +++ b/apps/cli/bin/colony.mjs @@ -0,0 +1,219 @@ +#!/usr/bin/env node +// Colony CLI bin shim with daemon fast-path for `colony bridge lifecycle`. +// +// Why: every IDE tool event fires `colony bridge lifecycle ...` from external +// hook integrations (oh-my-codex's ColonyBridge.spawnSync, Codex/Claude Code +// settings). Cold-starting Node on each event pegs ~one core for ~300 ms. +// Multiplied across concurrent agents this is a measurable CPU storm. When +// the worker daemon is running, we POST the envelope to /api/bridge/lifecycle +// and skip the rest of the CLI bootstrap entirely. +// +// Rules: +// - Only `bridge lifecycle --json` is fast-pathed. Everything else falls +// through to the in-process CLI so behavior is unchanged. +// - Daemon unreachable / errored / unknown flags / missing --json / trailing +// positional args ⇒ fall back to the in-process CLI with stdin intact +// (we buffer it so it can be replayed). +// - Pure node:* builtins so the same shim runs on Linux, macOS, and Windows +// (cmd, PowerShell, Git Bash) — no curl, no /bin/sh. + +import { connect } from 'node:net'; +import { request } from 'node:http'; +import { dirname, isAbsolute, resolve } from 'node:path'; +import { Readable } from 'node:stream'; +import { fileURLToPath, pathToFileURL } from 'node:url'; + +const HERE = dirname(fileURLToPath(import.meta.url)); +const CLI_ENTRY = (() => { + // COLONY_CLI_ENTRY is a test-only seam: the bin-shim tests point it at a + // stub so they can assert on argv/stdin replay without booting the real CLI. + const override = process.env.COLONY_CLI_ENTRY; + if (override) return isAbsolute(override) ? override : resolve(HERE, '..', override); + return resolve(HERE, '..', 'dist', 'index.js'); +})(); + +const fastEnv = (process.env.COLONY_BRIDGE_FAST ?? '1').toLowerCase(); +const FAST_DISABLED = + fastEnv === '0' || fastEnv === 'false' || fastEnv === 'no' || fastEnv === 'off'; + +const PORT = Number(process.env.COLONY_WORKER_PORT ?? 37777); +const HOST = '127.0.0.1'; +// Match the curl-based shell version: --connect-timeout 1, --max-time 2. +const CONNECT_TIMEOUT_MS = 1000; +const REQUEST_TIMEOUT_MS = 2000; + +const argv = process.argv.slice(2); + +await main().catch((err) => { + process.stderr.write(`colony: ${err instanceof Error ? err.message : String(err)}\n`); + process.exit(1); +}); + +async function main() { + // Non-fast-path-eligible commands take the unchanged CLI path immediately. + if (FAST_DISABLED || argv[0] !== 'bridge' || argv[1] !== 'lifecycle') { + await runCli(argv, null); + return; + } + + const parsed = parseBridgeLifecycleFlags(argv.slice(2)); + + // Bail on unknown flags, missing --json (humans want pretty output), or + // trailing positional args we don't know how to forward. Same triage as + // the legacy shell shim. + if (!parsed.ok || !parsed.json || parsed.rest.length > 0) { + await runCli(rebuildSafeArgv(parsed), null); + return; + } + + const body = await readAllStdin(); + const served = await tryDaemon({ ide: parsed.ide, cwd: parsed.cwd, body }); + if (served) return; + + // Daemon unreachable or non-200 — fall back to the in-process CLI with the + // buffered envelope replayed on stdin. + await runCli(rebuildSafeArgv(parsed), body); +} + +function parseBridgeLifecycleFlags(rest) { + const out = { ok: true, json: false, ide: '', cwd: '', rest: [] }; + let i = 0; + while (i < rest.length) { + const a = rest[i]; + if (a === '--json') { + out.json = true; + i += 1; + continue; + } + if (a === '--ide') { + out.ide = rest[i + 1] ?? ''; + i += 2; + continue; + } + if (a.startsWith('--ide=')) { + out.ide = a.slice('--ide='.length); + i += 1; + continue; + } + if (a === '--cwd') { + out.cwd = rest[i + 1] ?? ''; + i += 2; + continue; + } + if (a.startsWith('--cwd=')) { + out.cwd = a.slice('--cwd='.length); + i += 1; + continue; + } + if (a === '--') { + out.rest = rest.slice(i + 1); + break; + } + out.ok = false; + out.rest = rest.slice(i); + break; + } + return out; +} + +function rebuildSafeArgv(parsed) { + const out = ['bridge', 'lifecycle']; + if (parsed.json) out.push('--json'); + if (parsed.ide) out.push('--ide', parsed.ide); + if (parsed.cwd) out.push('--cwd', parsed.cwd); + return out; +} + +function readAllStdin() { + return new Promise((resolveOuter, rejectOuter) => { + if (process.stdin.isTTY) { + resolveOuter(Buffer.alloc(0)); + return; + } + const chunks = []; + process.stdin.on('data', (c) => chunks.push(c)); + process.stdin.on('end', () => resolveOuter(Buffer.concat(chunks))); + process.stdin.on('error', rejectOuter); + }); +} + +function probeDaemon() { + return new Promise((resolveOuter) => { + const socket = connect({ port: PORT, host: HOST }); + let settled = false; + const finish = (ok) => { + if (settled) return; + settled = true; + socket.destroy(); + resolveOuter(ok); + }; + socket.setTimeout(CONNECT_TIMEOUT_MS); + socket.once('connect', () => finish(true)); + socket.once('error', () => finish(false)); + socket.once('timeout', () => finish(false)); + }); +} + +async function tryDaemon({ ide, cwd, body }) { + if (!(await probeDaemon())) return false; + return new Promise((resolveOuter) => { + const req = request( + { + host: HOST, + port: PORT, + method: 'POST', + path: '/api/bridge/lifecycle', + headers: { + 'content-type': 'application/json', + 'content-length': body.length, + 'x-colony-ide': ide, + 'x-colony-cwd': cwd, + }, + timeout: REQUEST_TIMEOUT_MS, + }, + (res) => { + const chunks = []; + res.on('data', (c) => chunks.push(c)); + res.on('end', () => { + if (res.statusCode === 200) { + process.stdout.write(Buffer.concat(chunks)); + resolveOuter(true); + } else { + resolveOuter(false); + } + }); + res.on('error', () => resolveOuter(false)); + }, + ); + req.on('error', () => resolveOuter(false)); + req.on('timeout', () => { + req.destroy(); + resolveOuter(false); + }); + req.write(body); + req.end(); + }); +} + +async function runCli(args, stdinBuffer) { + // Make isMainEntry() in dist/index.js succeed when we dynamic-import it: + // it compares import.meta.url against the realpath of process.argv[1]. + // Pointing argv[1] at the resolved CLI entry makes the in-process import + // behave exactly like a direct `node dist/index.js` invocation. + process.argv = [process.argv[0], CLI_ENTRY, ...args]; + if (stdinBuffer && stdinBuffer.length > 0) { + installReplayStdin(stdinBuffer); + } + await import(pathToFileURL(CLI_ENTRY).href); +} + +function installReplayStdin(buf) { + const replay = Readable.from([buf]); + // Preserve a few properties consumers may sniff on process.stdin. + Object.assign(replay, { isTTY: false, fd: 0 }); + Object.defineProperty(process, 'stdin', { + value: replay, + configurable: true, + writable: true, + }); +} diff --git a/apps/cli/package.json b/apps/cli/package.json index 08f97b2..9a688f0 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -29,10 +29,10 @@ }, "type": "module", "bin": { - "colony": "bin/colony.sh" + "colony": "bin/colony.mjs" }, "main": "./dist/index.js", - "files": ["bin", "dist", "hooks-scripts", "README.md", "LICENSE"], + "files": ["bin/colony.mjs", "dist", "hooks-scripts", "README.md", "LICENSE"], "scripts": { "build": "tsup", "dev": "tsup --watch --onSuccess \"node dist/index.js\"", diff --git a/apps/cli/test/bin-shim.test.ts b/apps/cli/test/bin-shim.test.ts index e67bb47..c045f0f 100644 --- a/apps/cli/test/bin-shim.test.ts +++ b/apps/cli/test/bin-shim.test.ts @@ -1,18 +1,23 @@ import { spawnSync } from 'node:child_process'; import { chmodSync, mkdtempSync, readFileSync, rmSync, statSync, writeFileSync } from 'node:fs'; -import { tmpdir } from 'node:os'; +import { tmpdir, platform } from 'node:os'; import { dirname, join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -// Locking the wrapper's behavior at the shell level matters because rule #10 -// in CLAUDE.md ("hooks never wait on, never lose writes to, a daemon that may -// be down") is enforced by the wrapper, not by the worker. If the wrapper -// stops falling back to in-process Node when the daemon is unreachable, -// writes get silently dropped on the floor. +// Locking the wrapper's behavior matters because rule #10 in CLAUDE.md +// ("hooks never wait on, never lose writes to, a daemon that may be down") +// is enforced by the wrapper, not by the worker. If the wrapper stops +// falling back to the in-process CLI when the daemon is unreachable, writes +// get silently dropped on the floor. +// +// The shim used to be a POSIX shell script (`bin/colony.sh`) and these tests +// spawned it through `sh`. It is now a Node ES module so the same shim runs +// on Windows, macOS, and Linux — the tests drive it via `node bin/colony.mjs`. const HERE = dirname(fileURLToPath(import.meta.url)); -const SHIM = resolve(HERE, '..', 'bin', 'colony.sh'); +const SHIM = resolve(HERE, '..', 'bin', 'colony.mjs'); +const IS_WINDOWS = platform() === 'win32'; function freeUnusedPort(): string { // Port 1 is reserved/privileged on Linux. Connecting to it from a @@ -30,13 +35,16 @@ interface ShimRun { function runShim( args: string[], - opts: { stdin?: string; env?: NodeJS.ProcessEnv; nodeStub: string; logFile: string }, + opts: { stdin?: string; env?: NodeJS.ProcessEnv; cliStub: string; logFile: string }, ): ShimRun { - const result = spawnSync('sh', [SHIM, ...args], { + const result = spawnSync(process.execPath, [SHIM, ...args], { input: opts.stdin ?? '', env: { ...process.env, - PATH: `${dirname(opts.nodeStub)}:${process.env.PATH ?? ''}`, + // The shim resolves its CLI target via this env var so tests don't have + // to build dist/ to exercise the dispatch logic. + COLONY_CLI_ENTRY: opts.cliStub, + COLONY_STUB_LOG: opts.logFile, ...(opts.env ?? {}), }, encoding: 'utf8', @@ -58,34 +66,36 @@ function existsOrEmpty(path: string): string { } } -describe('bin/colony.sh', () => { +describe('bin/colony.mjs', () => { let dir: string; - let stubNode: string; + let cliStub: string; let stubLog: string; beforeEach(() => { dir = mkdtempSync(join(tmpdir(), 'colony-shim-')); - stubNode = join(dir, 'node'); + cliStub = join(dir, 'cli-stub.mjs'); stubLog = join(dir, 'stub.log'); - // Stub `node`: record argv (one per line, $@ expanded with newlines) - // and stdin so the test can assert on both. Exit 0 so `set -e` in the - // wrapper does not propagate a stub-driven failure. + // The shim dynamic-imports CLI_ENTRY in the same process. The stub + // records argv (one arg per line) and the buffered stdin so tests can + // assert on both, then exits cleanly. writeFileSync( - stubNode, + cliStub, [ - '#!/bin/sh', - `LOG="${stubLog}"`, - 'echo "ARGV_BEGIN" >>"$LOG"', - 'for a in "$@"; do echo "$a" >>"$LOG"; done', - 'echo "ARGV_END" >>"$LOG"', - 'echo "STDIN_BEGIN" >>"$LOG"', - 'cat >>"$LOG"', - 'echo "" >>"$LOG"', - 'echo "STDIN_END" >>"$LOG"', - 'exit 0', + "import { appendFileSync } from 'node:fs';", + "const log = process.env.COLONY_STUB_LOG;", + "appendFileSync(log, 'ARGV_BEGIN\\n');", + "for (const a of process.argv.slice(2)) appendFileSync(log, a + '\\n');", + "appendFileSync(log, 'ARGV_END\\n');", + "appendFileSync(log, 'STDIN_BEGIN\\n');", + "let buf = '';", + "process.stdin.setEncoding('utf8');", + "process.stdin.on('data', (c) => { buf += c; });", + "process.stdin.on('end', () => {", + " appendFileSync(log, buf + '\\nSTDIN_END\\n');", + "});", + "", ].join('\n'), ); - chmodSync(stubNode, 0o755); }); afterEach(() => { @@ -94,19 +104,25 @@ describe('bin/colony.sh', () => { it('exists and is executable when packaged', () => { const stat = statSync(SHIM); - // Owner exec bit. npm pack preserves the executable bit when packaging, - // so this is what e2e-publish.sh ends up installing as $PREFIX/bin/colony. - expect(stat.mode & 0o100).toBeTruthy(); + if (IS_WINDOWS) { + // Windows has no POSIX exec bit; npm generates `.cmd`/`.ps1` wrappers + // that invoke `node bin\\colony.mjs`. We just assert the file is there. + expect(stat.isFile()).toBe(true); + } else { + // Owner exec bit. npm pack preserves the executable bit when packaging, + // so this is what e2e-publish.sh ends up installing as $PREFIX/bin/colony. + expect(stat.mode & 0o100).toBeTruthy(); + } }); - it('falls back to Node when the daemon is unreachable, with stdin and args intact (rule-10 contract)', () => { + it('falls back to the CLI when the daemon is unreachable, with stdin and args intact (rule-10 contract)', () => { const envelope = '{"event_id":"e_test_1","event_name":"pre_tool_use"}'; const result = runShim( ['bridge', 'lifecycle', '--json', '--ide', 'claude-code', '--cwd', '/tmp/has spaces'], { stdin: envelope, env: { COLONY_WORKER_PORT: freeUnusedPort() }, - nodeStub: stubNode, + cliStub, logFile: stubLog, }, ); @@ -128,7 +144,7 @@ describe('bin/colony.sh', () => { const result = runShim(['bridge', 'lifecycle', '--json'], { stdin: '{}', env: { COLONY_BRIDGE_FAST: '0' }, - nodeStub: stubNode, + cliStub, logFile: stubLog, }); @@ -139,7 +155,7 @@ describe('bin/colony.sh', () => { it('passes through non-bridge-lifecycle commands unchanged', () => { const result = runShim(['--version'], { - nodeStub: stubNode, + cliStub, logFile: stubLog, }); @@ -151,7 +167,7 @@ describe('bin/colony.sh', () => { const result = runShim(['bridge', 'lifecycle'], { stdin: '{}', env: { COLONY_WORKER_PORT: freeUnusedPort() }, - nodeStub: stubNode, + cliStub, logFile: stubLog, }); @@ -161,10 +177,10 @@ describe('bin/colony.sh', () => { expect(result.log).not.toContain('--json'); }); - it('passes through `bridge replay ` unchanged (no fast-path, Node owns it)', () => { + it('passes through `bridge replay ` unchanged (no fast-path, CLI owns it)', () => { const result = runShim(['bridge', 'replay', 'foo.pre.json'], { env: { COLONY_WORKER_PORT: freeUnusedPort() }, - nodeStub: stubNode, + cliStub, logFile: stubLog, }); diff --git a/apps/worker/src/server.ts b/apps/worker/src/server.ts index 0494b54..00c04e1 100644 --- a/apps/worker/src/server.ts +++ b/apps/worker/src/server.ts @@ -176,7 +176,7 @@ export function buildApp( }); // Daemon fast-path for `colony bridge lifecycle`. The bin shim at - // apps/cli/bin/colony.sh POSTs the envelope here when the worker is up, + // apps/cli/bin/colony.mjs POSTs the envelope here when the worker is up, // skipping a per-event Node cold start. Falls through to the in-process // Node CLI on any failure (see shim's stdin-replay logic), so writes // still succeed when the daemon is down — that's the contract that diff --git a/scripts/bench-bridge-fastpath.mjs b/scripts/bench-bridge-fastpath.mjs index af5a7a6..4fb2b39 100644 --- a/scripts/bench-bridge-fastpath.mjs +++ b/scripts/bench-bridge-fastpath.mjs @@ -2,7 +2,7 @@ // Benchmark the bridge-lifecycle fast-path vs the in-process Node fallback. // // Boots a fresh worker against a temp HOME, then runs N concurrent -// invocations of `apps/cli/bin/colony.sh bridge lifecycle --json ...` with +// invocations of `apps/cli/bin/colony.mjs bridge lifecycle --json ...` with // realistic OMX envelopes — once with the daemon reachable (fast path), // once with COLONY_BRIDGE_FAST=0 (forced fallback, mirrors today's behavior // before this PR). Reports wall time, mean, p95. @@ -19,7 +19,7 @@ import { fileURLToPath } from 'node:url'; const HERE = dirname(fileURLToPath(import.meta.url)); const REPO = resolve(HERE, '..'); -const SHIM = resolve(REPO, 'apps/cli/bin/colony.sh'); +const SHIM = resolve(REPO, 'apps/cli/bin/colony.mjs'); const CONCURRENCY = Number(process.argv[2] ?? 8); const ITERATIONS = Number(process.argv[3] ?? 4); @@ -53,7 +53,7 @@ function mkenvelope(i) { function runOnce(envelope, env) { const start = process.hrtime.bigint(); const result = spawnSync( - 'sh', + process.execPath, [SHIM, 'bridge', 'lifecycle', '--json', '--ide', 'claude-code', '--cwd', '/tmp/bench'], { input: envelope,