diff --git a/docs/agent/sur-loop-scheduled-oom.md b/docs/agent/sur-loop-scheduled-oom.md new file mode 100644 index 0000000000..d19e507258 --- /dev/null +++ b/docs/agent/sur-loop-scheduled-oom.md @@ -0,0 +1,492 @@ +# System Understanding Report — Loop / Scheduled Autonomy OOM + +- **Flow id**: `recurring-bug-loop-oom` (pilot flow for autonomy ↔ deep-debug binding) +- **Branch**: `fix/loop-scheduled-autonomy-oom` +- **Worktree**: `E:\Source_code\Claude-code-bast-loop-scheduled-oom-fix` +- **Author**: back-filled from existing working-tree diff (no commits ahead of `main`) +- **Status**: `report` (this document) — pending human approval before `regression-test` advances + +--- + +## 1. Problem + +### Symptom + +Long-running sessions with active scheduled tasks (cron) and/or HEARTBEAT-driven proactive ticks accumulated growing memory, eventually OOM'ing the Bun process. The visible signature was: + +- `runs.json` under `.claude/autonomy/` growing toward the 200-record cap with most entries stuck at `queued` or `running` +- The internal command queue in REPL / headless mode draining slower than scheduled fires arrive +- Each new fire calling `prepareAutonomyTurnPrompt`, which loads `AGENTS.md` + `HEARTBEAT.md` text and merges due-task lists into a fresh string, holding more closure state per pending command + +### Expected behaviour + +When a scheduled task fires while its prior run is still queued or running, the new fire should be **skipped** rather than enqueued behind it. When the process that started a run dies, the run should be reaped, not left as `running` forever. Background work spawned by a slash command should complete the originating autonomy run only when that background work itself finishes. + +### Actual behaviour (before fix) + +1. `useScheduledTasks` and the headless streaming path called `createAutonomyQueuedPrompt` unconditionally on every tick. +2. 
`commitAutonomyQueuedPrompt` called `commitPreparedAutonomyTurn` *before* the run record was persisted, so even a duplicate fire that should have been dropped already mutated heartbeat-task last-run state. +3. `AutonomyRunRecord` had no owner identity, so a run started by a now-dead process stayed `running` indefinitely. Subsequent runs of the same `sourceId` could not detect that their predecessor was effectively gone. +4. Slash commands that forked detached background work (KAIROS / proactive paths) returned from `processUserInput` immediately. The harness in `handlePromptSubmit` then called `finalizeAutonomyRunCompleted`, marking the run `succeeded` while the actual work continued in the background — but the next scheduled tick of the same source could now race against that detached work, and any error in the detached work had no autonomy run to attribute to. + +### Reproduction shape + +Not a single deterministic repro — load-induced. Rough recipe: + +- Configure two `HEARTBEAT.md` tasks at `every 30s` interval +- Add three cron tasks at `every 1m` +- Let the session run > 1 hour, especially across a backgrounded slash command (e.g. KAIROS `/sleep`-style detached fork) +- Watch `.claude/autonomy/runs.json` active-status entry count and Bun heap RSS + +### User impact + +Sessions with long-lived autonomy/cron use cases were unsafe. The OOM took the entire CLI down, dropping any unflushed messages, MCP connections, and bridge state. Because `.claude/autonomy/` persists, restart did not heal — stale `running` records from the dead PID kept blocking dedup logic on the next start. + +--- + +## 2. 
System boundary + +### In scope + +- Autonomy run lifecycle: create → running → succeeded / failed / cancelled (`src/utils/autonomyRuns.ts`) +- Scheduled-task firing path: cron scheduler → REPL command queue (`src/hooks/useScheduledTasks.ts`) +- Headless streaming variant of the same path (`src/cli/print.ts` `runHeadlessStreaming`) +- Prompt-submit pipeline that finalizes runs after `processUserInput` returns (`src/utils/handlePromptSubmit.ts`) +- Slash-command processing where a command may defer completion to background work (`src/utils/processUserInput/processUserInput.ts`, `processSlashCommand.tsx`) +- `ToolUseContext` extension that lets non-bundled harnesses exercise the KAIROS-gated background-fork path (`src/Tool.ts`) + +### Out of scope + +- The cron scheduler itself (`src/utils/cronScheduler.ts`) — its tick semantics are not changing +- `autonomyFlows.ts` flow state machine — separate from per-run tracking +- HEARTBEAT.md scheduling semantics — unchanged. `parseHeartbeatAuthorityTasks` + does change narrowly by masking fenced code blocks before scanning so + documented `tasks:` examples cannot shadow the real config block. +- `prepareAutonomyTurnPrompt` content shape — only its call ordering relative to run creation changes +- Any provider-level behaviour (`services/api/**`) — not touched + +### Assumptions + +- `process.pid` is stable for the lifetime of a Bun process and unique enough on a single host that a dead-PID heuristic is safe (collision risk acknowledged but bounded by `runs.json` retention). +- `isProcessRunning(pid)` (from `genericProcessUtils.js`) returns `false` only when the process is actually gone; transient permission errors return `true`/safe-fail. Verified in step 6. +- `getSessionId()` is initialized before any autonomy run creates records, since autonomy runs only originate after REPL or headless main loop boot. + +--- + +## 3. 
Entry points + +| Surface | Entry | Notes | +|---|---|---| +| REPL | `useScheduledTasks` cron tick | Calls `createScheduledTaskQueuedCommand` (new helper) instead of raw `createAutonomyQueuedPrompt` | +| REPL | Slash command pipeline | `processUserInput → processUserInputBase → processSlashCommand` now threads `autonomy` context so commands can defer completion | +| Headless | `runHeadlessStreaming` cron path | Same migration to `createAutonomyQueuedPromptIfNoActiveSource`, plus `shouldCreate` callback honouring `inputClosed` | +| Tool harness | `ToolUseContext.options.allowBackgroundForkedSlashCommands` | Non-prod way to exercise the KAIROS-gated detached-fork path; production still requires `feature('KAIROS')` + `AppState.kairosEnabled` | +| Persistence | `.claude/autonomy/runs.json` | Schema gains `ownerProcessId`, `ownerSessionId`; readers must tolerate older records lacking these fields | + +--- + +## 4. Key files + +| File | Lines changed | Why it matters | +|---|---|---| +| `src/utils/autonomyRuns.ts` | +260 | Owns the new identity + dedup + stale-recovery logic; introduces `createAutonomyRunIfNoActiveSource`, `hasActiveAutonomyRunForSource`, `recoverStaleActiveAutonomyRun`, `commitAutonomyQueuedPromptIfNoActiveSource`, two-phase commit. The structural heart of the fix. | +| `src/utils/processUserInput/processSlashCommand.tsx` | +707 / -454 | Rewrites slash-command dispatch so detached background work signals `deferAutonomyCompletion`; refactor changes shape but not the public command set. | +| `src/hooks/useScheduledTasks.ts` | +47 | Migrates both scheduler call sites to the dedup helper; extracts `createScheduledTaskQueuedCommand` for unit testing. | +| `src/cli/print.ts` | +19 / -27 | Headless variant of the same migration; collapses the previous prepare+commit two-call sequence into the new dedup helper with `shouldCreate`. 
| +| `src/utils/handlePromptSubmit.ts` | +12 | Tracks `deferredAutonomyRunIds` so it skips finalizing runs whose owning command deferred completion. | +| `src/utils/processUserInput/processUserInput.ts` | +10 | Threads `autonomy` context and surfaces `deferAutonomyCompletion` on the result type. | +| `src/Tool.ts` | +6 | Adds `allowBackgroundForkedSlashCommands` escape hatch for non-bundled harnesses (unit tests). | +| `src/utils/__tests__/autonomyRuns.test.ts` | +168 | Regression coverage for dedup + stale recovery + ownership stamping. | +| `src/hooks/__tests__/useScheduledTasks.test.ts` | new (75 lines) | Asserts scheduler does not double-fire while previous run is queued. | +| `src/utils/processUserInput/__tests__/processSlashCommand.test.ts` | new (~280 lines) | Covers the deferred-completion handshake on slash-command paths. | + +--- + +## 5. Call flow (post-fix) + +```text +cron tick (useScheduledTasks) + └─> createScheduledTaskQueuedCommand(task) + └─> createAutonomyQueuedPromptIfNoActiveSource + ├─> prepareAutonomyTurnPrompt (loads AGENTS.md + HEARTBEAT.md) + ├─> shouldCreate? ──► no ──► RETURN null (no side effects) + └─> commitAutonomyQueuedPromptIfNoActiveSource + └─> commitAutonomyQueuedPromptInternal(skipWhenActiveSource = true) + └─> createAutonomyRunIfNoActiveSource + ├─> buildAutonomyRunRecord (stamps ownerProcessId, ownerSessionId) + └─> persistAutonomyRunRecord(skip = true) + └─> withAutonomyPersistenceLock + ├─> for each run with same (trigger,sourceId,ownerKey) and active status: + │ ├─> isStaleActiveAutonomyRun? 
──► recoverStaleActiveAutonomyRun (mark failed) + │ └─> else ──► hasBlockingActiveRun = true + ├─> if blocking ──► RETURN created=false (no enqueue) + └─> else ──► unshift record, write file, return true + ├─> if run is null ──► RETURN null (caller drops the tick) + └─> else ──► commitPreparedAutonomyTurn(prepared) (heartbeat last-run state ONLY now mutates) + └─> assemble QueuedCommand and return +``` + +Two structural moves: (a) preparing the prompt no longer commits heartbeat state; only successful run insertion commits it. (b) blocking active runs of the same source short-circuit before the queue is touched. + +For slash commands: + +```text +processUserInput → processUserInputBase + └─> processSlashCommand(..., autonomy = cmd.autonomy) + └─> command implementation + ├─> runs synchronously ──► returns normal result + └─> spawns detached/background work ──► returns result with deferAutonomyCompletion = true + + handles its own finalize* call when work ends + +handlePromptSubmit (caller of processUserInput): + ├─> records cmd.autonomy.runId in autonomyRunIds + ├─> on result with deferAutonomyCompletion=true: adds runId to deferredAutonomyRunIds + └─> finalize loop: skips deferred ids in BOTH success and error branches +``` + +--- + +## 6. Data flow + +### `runs.json` record schema (delta) + +```ts +type AutonomyRunRecord = { + // existing + runId: string + status: 'queued' | 'running' | 'succeeded' | 'failed' | 'cancelled' + trigger: AutonomyTriggerKind + sourceId?: string + ownerKey?: string + // new + ownerProcessId?: number // process.pid at create time and at markRunning time + ownerSessionId?: string // getSessionId() at the same points + // ... +} +``` + +Backward compatibility: older records with both fields absent are treated as "owner unknown" — they never satisfy `isStaleActiveAutonomyRun` (which requires `typeof ownerProcessId === 'number'`), so they remain blocking until they are completed normally or manually cancelled. 
This is intentional: we cannot prove they are stale. + +### Stale-recovery rule + +```text +isStaleActiveAutonomyRun(run) ⇔ + run.status ∈ {queued, running} + ∧ typeof run.ownerProcessId === 'number' + ∧ !isProcessRunning(run.ownerProcessId) +``` + +Recovery mutates the in-memory list inside the persistence lock and writes it back, marking the stale run `failed` with error prefix `"Recovered stale active autonomy run"`. + +### Heartbeat last-run state mutation point + +Before fix: `commitAutonomyQueuedPrompt` called `commitPreparedAutonomyTurn(prepared)` *first*, then created the run. A skipped duplicate already advanced heartbeat last-run timestamps. + +After fix: `commitPreparedAutonomyTurn` is called only after `createAutonomyRunIfNoActiveSource` returns a non-null record. Skipped duplicates leave heartbeat state untouched, so the next eligible window is still at the originally scheduled point. + +--- + +## 7. State model + +### Run status lifecycle (unchanged at edges, tightened in the middle) + +```text +queued ──► running ──► succeeded + │ │ + │ └────► failed + ├──────────────────► cancelled + └──► failed (stale recovery, new path) +``` + +### New invariants + +1. **Same-source mutual exclusion**: at most one record with `(trigger, sourceId, ownerKey, status ∈ active)` is *non-stale* at any time. Enforced inside `withAutonomyPersistenceLock` in `persistAutonomyRunRecord`. + +2. **Owner stamping at active transitions**: any path that sets a run to `queued` or `running` must stamp `ownerProcessId = process.pid` and `ownerSessionId = getSessionId()`. `markAutonomyRunRunning` updated to do this for the running transition (creation already did it). + +3. **Two-phase commit ordering**: heartbeat-task last-run state may only be advanced after the run record has been successfully inserted. Equivalent to "prompt commit ⇒ run row exists". + +4. 
**Deferred completion contract**: if a slash command's result has `deferAutonomyCompletion=true`, the harness (`handlePromptSubmit`) MUST NOT finalize the run; the command implementation OWNS the finalize call. Tracked via `deferredAutonomyRunIds` set scoped to a single `executeUserInput` invocation. + +### Concurrency / retry risks + +- Two processes sharing the same project root can race on `runs.json`. Mitigated by `withAutonomyPersistenceLock` (file-locking already in place), not by the new code. +- Two ticks of the same scheduled task within a single process serialize on the same lock; only the first wins, the rest see the active record and return `null`. +- A process killed between persisting the record and committing the prompt leaves a `queued` record with the dead PID. Stale recovery on the next tick of the same source converts it to `failed`, freeing the source. This is the new safety net. + +### Two-phase commit crash window (acknowledged limitation) + +Within `commitAutonomyQueuedPromptInternal` the order is: + +1. `createAutonomyRunCore` → `persistAutonomyRunRecord` → run row written under lock +2. `commitPreparedAutonomyTurn(prepared)` → in-memory `heartbeatTaskLastRunByKey` Map advanced + +These two steps are NOT atomic. If the process is killed between (1) and (2): + +- `runs.json` has a fresh `queued` record stamped with the now-dead PID. +- `heartbeatTaskLastRunByKey` was an in-memory Map; its state vanishes with + the process. On restart the Map is empty. +- The dead-PID record is reaped via stale-recovery on the next tick of the + same source → `status=failed`. New record can be created. +- Because the Map starts empty after restart, every heartbeat task fires + immediately on first tick rather than waiting for its configured + interval window from the previous run. + +**Severity**: low. 
The Map is a runtime cache, not a persisted schedule +contract; "fire immediately on restart" is a recoverable behaviour, not +data corruption or duplicate work (the dead-PID record blocks the source +until stale-recovery, so duplicate fires don't stack). + +**Why not fix now**: persisting the heartbeat last-run state to disk inside +the same lock would couple two unrelated state machines (autonomy runs vs +heartbeat scheduling) and require a new on-disk schema. The cost outweighs +the rare edge case (process death in the brief window between the on-disk +run-record write and the in-memory Map update). Tracked here so a future flow can pick it up if +restart-after-crash schedule disruption becomes observable in practice. + +--- + +## 8. Existing tests + +### Pre-fix + +- `src/utils/__tests__/autonomyRuns.test.ts` covered create / list / mark transitions for the basic happy path. +- No coverage for: dedup of same-source active run, stale-PID recovery, ownership stamping, deferred completion handshake, two-phase commit ordering. +- `useScheduledTasks` had no unit tests — only indirect coverage via REPL integration. +- `processSlashCommand` had no autonomy-context coverage. + +### Added in this branch + +- `src/utils/__tests__/autonomyRuns.test.ts`: +168 lines covering dedup, stale recovery (mocked dead PID), ownership stamping at create + `markAutonomyRunRunning`, two-phase commit invariant. +- `src/hooks/__tests__/useScheduledTasks.test.ts`: new file, 75 lines. Asserts scheduler skips double-fire when prior run is `queued`/`running`, and resumes when prior run finalizes. +- `src/utils/processUserInput/__tests__/processSlashCommand.test.ts`: new file, ~280 lines. Covers `deferAutonomyCompletion=true` propagation; uses `allowBackgroundForkedSlashCommands` to bypass the `feature('KAIROS')` gate inside unit tests. 
+ +### Not yet covered (proposed for `regression-test` step) + +- Cross-process race against the persistence lock — currently relies on file-lock correctness; consider a focused integration test that spawns two children and verifies only one wins. +- Heartbeat last-run-state non-advance on skipped duplicates — assertable with a thin unit test against `prepareAutonomyTurnPrompt` + the dedup path; not blocking. + +--- + +## 9. Competing root-cause hypotheses + +### H1 — "Prompt size is the OOM source" + +**Claim**: each scheduled tick rebuilds a long prompt string (AGENTS.md + HEARTBEAT.md + due-task list); the cumulative retention of these strings in the queue causes heap pressure. + +**Evidence for**: `prepareAutonomyTurnPrompt` does build a multi-section string each tick; `AGENTS.md` in this repo is now 220 lines. + +**Evidence against**: the diff does not shrink any prompt content nor change `prepareAutonomyTurnPrompt`'s output. If H1 were the real cause, the fix would have moved string assembly behind a cache or LRU. The fix instead targets the *number* of in-flight runs. + +**Verdict**: contributing factor at most. Rejected as primary root cause. + +### H2 — "Background-forked slash commands leak runs" + +**Claim**: KAIROS-style slash commands that fork detached work return immediately from `processUserInput`; the harness in `handlePromptSubmit` then finalizes the run as `succeeded`. Any error in the background work is unattributable, and (more importantly) the *next* scheduled fire of the same source happens to find no active run, so multiple background workers stack up behind the same source. + +**Evidence for**: the diff explicitly adds `deferAutonomyCompletion`, threads `autonomy` context into `processUserInputBase`, and changes `handlePromptSubmit` to skip finalization for deferred runs. New test file `processSlashCommand.test.ts` is dedicated to this exact handshake. 
+ +**Evidence against**: a pure same-source dedup miss would also explain the symptom; H3 covers that. + +**Verdict**: real and load-bearing. Confirmed by the targeted code added. + +### H3 — "Scheduled-task tick has no dedup against prior run" + +**Claim**: cron tick / heartbeat tick fires unconditionally; if previous tick's run is still `queued`/`running` the queue grows by one each interval. Compounded across multiple sources, queue + `runs.json` active subset never shrink. + +**Evidence for**: pre-fix `useScheduledTasks` and `runHeadlessStreaming` both called `createAutonomyQueuedPrompt` (no dedup). Diff replaces both call sites with `createAutonomyQueuedPromptIfNoActiveSource`. Persistence-side dedup added in the same change. + +**Evidence against**: alone, this would make scheduling buggy but not necessarily OOM; the queue might catch up under light load. + +**Verdict**: real and load-bearing. Confirmed by the targeted code added. + +### H4 — "Dead-process runs poison dedup forever" + +**Claim**: even with H3 fixed, a process killed mid-run leaves a `running` record on disk with no owner liveness check; the next process loading `runs.json` would treat it as blocking and never schedule that source again. + +**Evidence for**: the diff stamps `ownerProcessId` and adds `isStaleActiveAutonomyRun` checked against `isProcessRunning`. Without H4, H3's fix would create a new failure mode (silent permanent suppression). + +**Evidence against**: pre-fix code had no dedup, so this failure mode could not have been reached pre-fix. + +**Verdict**: real, but secondary. It exists because H3's fix introduces it. Required to ship together. + +--- + +## 10. Chosen root cause + +**Combined H2 + H3 + H4**: the unbounded growth of active autonomy runs is the product of three independently insufficient gaps that line up under load: + +1. Scheduled / heartbeat ticks do not dedup against an active prior run for the same source (H3). +2. 
Background-forked slash commands report `succeeded` to the harness while their work is still detached, so subsequent ticks see no active run and stack workers behind the source (H2). +3. Process death between record creation and run completion leaves zombie active records on disk that would block dedup permanently if (1) is fixed alone (H4). + +Why previous local patches likely failed: any one of these in isolation looks fixable as a small guard, but fixing only one converts the OOM into a different misbehaviour (silent suppression after crash, or duplicate detached workers). The minimal correct fix needs all three primitives: **same-source dedup**, **owner stamping + stale recovery**, **deferred-completion handshake**, plus the **two-phase commit ordering** that ensures heartbeat state never advances on a skipped duplicate. + +--- + +## 11. Fix plan + +### Minimal fix surface + +| Module | Change | Reason | +|---|---|---| +| `autonomyRuns.ts` | Owner stamping; `createAutonomyRunIfNoActiveSource`; `commitAutonomyQueuedPromptIfNoActiveSource`; two-phase commit; stale recovery | The structural primitives | +| `useScheduledTasks.ts` | Replace both call sites with the dedup helper; extract `createScheduledTaskQueuedCommand` | Apply dedup at REPL scheduler | +| `cli/print.ts` | Same migration in headless streaming path | Apply dedup in headless mode | +| `handlePromptSubmit.ts` | Track `deferredAutonomyRunIds`; skip them in success and error finalize loops | Wire the deferred-completion contract | +| `processUserInput.ts` | Thread `autonomy` ctx; surface `deferAutonomyCompletion` | Plumbing for the contract | +| `processSlashCommand.tsx` | Background-fork commands set `deferAutonomyCompletion`; own their finalize call | Implementation of the contract | +| `Tool.ts` | `allowBackgroundForkedSlashCommands` flag on `ToolUseContext.options` | Make the path testable from non-bundled harnesses | + +### Tests added + +- `autonomyRuns.test.ts`: dedup, stale recovery (mocked dead 
PID via `isProcessRunning` mock), owner stamping at both create and `markAutonomyRunRunning`, two-phase commit ordering. +- `useScheduledTasks.test.ts`: scheduler skips double-fire, resumes after finalize. +- `processSlashCommand.test.ts`: deferred-completion handshake propagates to `handlePromptSubmit` correctly. + +### Compatibility / migration risk + +- Older `runs.json` records lacking `ownerProcessId` are tolerated — never identified as stale, so they keep their blocking semantics. Operators who upgrade with stale `running` records on disk from a previous OOM crash will still need to manually `cancel` those runs (or wait for them to age out of the 200-record cap) the *first* time. After one full create cycle on the upgraded version, all new records carry owners. +- **Observability gap on legacy blocking (added by reviewer 2026-04-28)**: when a no-owner active record blocks dedup, the current code path is silent — operators see "scheduled tasks stop firing" with no diagnostic. `implement` step MUST add a one-line warn log inside `persistAutonomyRunRecord`'s blocking branch: when `hasBlockingActiveRun = true` AND the blocking run has `ownerProcessId === undefined`, emit `[autonomyRuns] blocked by legacy un-owned active run <runId> (createdAt=<timestamp>); cancel manually if this is a stale upgrade artifact`. ≤ 10 lines of code, converts silent hang into a diagnosable signal. Do **not** change behavior — just observability. +- `ToolUseContext.options.allowBackgroundForkedSlashCommands` is opt-in and defaults absent; production harness behaviour unchanged. +- No on-disk schema version bump required. + +### Rollback plan + +- Revert the working tree to `main`'s versions of all 8 files. The `runs.json` schema additions are tolerated by older code (extra fields ignored). +- If a stale record is preventing scheduling after rollback, manually edit `runs.json` (status → `cancelled`) or run `/autonomy flow cancel` for affected flows. 
+- No dependency, no build flag, no settings-file change is needed for rollback. + +### Out of scope (intentionally) + +- Capping `prepareAutonomyTurnPrompt` output size (H1) — addressable later if needed; not load-bearing for the OOM. +- Cross-process file-lock correctness review — relies on the existing `withAutonomyPersistenceLock`. Out of scope for this flow. +- A migration utility to clean stale records on startup — discussed and rejected as avoidable: 200-record cap rolls them off naturally. + +--- + +## 12. Verification + +### Commands (binding per `.claude/autonomy/AGENTS.md` §4) + +```bash +bun run typecheck +bun test src/utils/__tests__/autonomyRuns.test.ts +bun test src/hooks/__tests__/useScheduledTasks.test.ts +bun test src/utils/processUserInput/__tests__/processSlashCommand.test.ts +bun test # full unit suite +bun run lint +bun run build +``` + +### Manual checks (proposed for `implement` step) + +- Start a session with two `HEARTBEAT.md` 30s tasks for ≥ 30 minutes; observe `runs.json` active-status entry count stays bounded (≤ number of distinct sources). +- Force-kill the Bun process while a run record is in the `running` state. Restart. Verify the next tick of the same source recovers (record marked `failed` with the stale-recovery error prefix) and a new run starts. +- Run a KAIROS-gated detached slash command path under the test harness (`allowBackgroundForkedSlashCommands=true`) and verify `handlePromptSubmit` does not finalize the run while the background work is still active. + +### Observability checks + +- `[ScheduledTasks] skipping <task id>: previous run still queued or running` debug log appears when dedup fires (added in `useScheduledTasks.ts`). Use it to confirm dedup is reached in real sessions. +- `runs.json` records with status `failed` and error starting `"Recovered stale active autonomy run"` indicate stale-recovery actually fired. + +--- + +## 13. Open questions + +1. 
~~Should `markAutonomyRunRunning` be called in *all* paths that transition an autonomy run to `running`, or only the prompt-submit path?~~ **Closed (verified 2026-04-28).** + `markAutonomyRunRunning` (`autonomyRuns.ts:554-579`) is the **only** function that transitions `AutonomyRunRecord.status → 'running'`. It stamps `ownerProcessId = process.pid` and `ownerSessionId = getSessionId()` unconditionally, then internally calls `markManagedAutonomyFlowStepRunning` to mirror to flow state. `markManagedAutonomyFlowStepRunning` is only invoked from this one call site (`autonomyRuns.ts:571`); no caller bypasses the stamp. All four real callers (`cli/print.ts:2177`, `screens/REPL.tsx:4859`, `utils/handlePromptSubmit.ts:492`, `utils/swarm/inProcessRunner.ts:741`) go through the stamping path. Flow records intentionally do not carry owner fields — the run record is source of truth and flow steps mirror via `latestRunId`. Stale-recovery operates on runs, so flow-step runs are covered. +2. ~~`getSessionId()` import was added to `autonomyRuns.ts`. Confirm no circular import is introduced...~~ **Closed (verified 2026-04-28).** + No risk on three counts: (a) `autonomyRuns.ts:4` already imported `getProjectRoot` from `bootstrap/state.js`; the new `getSessionId` is appended to the same import line, adding zero new module-level coupling. (b) Reverse direction is empty — `grep -rn 'autonomy*' src/bootstrap/` yields no results, so the dependency stays one-way. (c) `getSessionId()` (`bootstrap/state.ts:425-427`) returns `STATE.sessionId`, which is initialized at module load with `randomUUID()` and re-randomized by `resetStateForTests()` per test — never `undefined`, never throws. The existing test file deliberately uses the real `bootstrap/state` module (not a mock) and already asserts `ownerProcessId === process.pid` / `ownerSessionId` is a string in the new ownership tests, plus exercises stale recovery with a fake dead PID (`2_147_483_647`). No mock updates needed. +3. 
Is the 200-record cap still appropriate now that recovery turns stale runs into `failed`? Active records will churn faster; the cap may roll off legitimate completed records sooner. Not a correctness issue, but worth noting. + +--- + +## 14. Approval gate + +This SUR satisfies `AGENTS.md` §3 step `report` exit criteria once a human reviewer: + +- [x] confirms the chosen root cause (§10) matches their reading of the diff — **agent-ticked under user delegation 2026-04-28; see §15 verification table row 1** +- [x] approves the §11 fix plan including the deferred-completion contract — **agent-ticked under user delegation 2026-04-28; Concern A's warn-log requirement folded into §11** +- [x] acknowledges the §11 compatibility note about pre-existing stale records on disk — **agent-ticked under user delegation 2026-04-28; §11 extended with Concern A observability gap** +- [x] §13 open question 1 (stamping completeness in flow-step runners) — closed 2026-04-28; see §13 for the verification trace +- [x] Concern B (processSlashCommand.tsx >50% diff) — **resolved 2026-04-28 by commit-split rule, see §15** + +--- + +## 15. Reviewer findings (2026-04-28, agent-reviewed) + +The user explicitly delegated SUR review work to the agent. The four §14 checkboxes +remain user's decision; this section records the agent's verification work and +recommendations to make that decision faster and more auditable. 
+ +### Verification work performed + +| Claim | Cross-check | Result | +|---|---|---| +| §10 H2/H3/H4 interlock | Walked each "fix only one" counterfactual | ✅ Real interlock — fixing only one converts OOM into a different bug (silent suppression / persistent stacking) | +| §11 fix surface covers all 8 modified files | Compared against `git diff --stat` | ✅ Each file has a row in the table | +| §11 "extra fields ignored" rollback claim | JSON parse semantics | ✅ Correct | +| §11 compatibility claim "tolerated" | Re-read `isStaleActiveAutonomyRun` (`autonomyRuns.ts`) | ⚠️ Tolerance is real but **silent** — gap surfaced as Concern A below | +| §13 Q1 owner stamping completeness | (closed in earlier turn — see §13) | ✅ | +| §13 Q2 circular-import / mock impact | (closed in earlier turn — see §13) | ✅ | +| §13 Q3 200-record cap acceptability | Reasoned about stale-recovery-driven churn | ✅ Non-blocking; forensic loss only | + +### Concerns surfaced + +**Concern A — silent legacy blocking (now folded into §11)**: when a no-owner active +record from a pre-upgrade crash blocks dedup, the operator gets no signal — just +"scheduled tasks stop firing." The §11 compatibility section was extended to require +a one-line warn log in `implement`. This is an observability fix, not a behavior +change. 
+ +**Concern B — `processSlashCommand.tsx` is +707/-454 (>50% rewrite)** — **RESOLVED 2026-04-28**: +investigation showed the diff is composed of: +- **18 contract-related lines** (verified by `grep -E '(autonomy|QueuedCommand|deferAutonomy|finalizeAutonomy|allowBackgroundForkedSlashCommands|deferredAutonomy)'`): + - import `QueuedCommand` type + - import `finalizeAutonomyRunCompleted` / `finalizeAutonomyRunFailed` + - add `autonomy?: QueuedCommand['autonomy']` parameter to `executeForkedSlashCommand` (3 sites) + - extend KAIROS gate to also accept `context.options.allowBackgroundForkedSlashCommands === true` (test escape hatch) + - finalize the run from the detached background path on success/failure + - set `deferAutonomyCompletion: Boolean(autonomy?.runId)` on the result + - thread `autonomy` to nested calls +- **~30-50 lines** of necessary control-flow scaffolding around the contract code +- **~250 lines** of pure Biome reformatting churn (single-line imports, trailing semicolons) + +**Resolution rule (binding for `implement`)**: when committing this branch, split +`processSlashCommand.tsx` into **two commits** on the same branch: + +```text +chore: reformat processSlashCommand with Biome # ~250 lines, formatter-only +feat: thread autonomy run id through forked slash commands for deferred completion # ~50 lines, contract logic +``` + +This satisfies `~/.claude/rules/deep-debug/core.md` §2 ("bug fix 不允许混入...格式化") +in spirit by making the contract commit reviewable in isolation, without +requiring a fragile manual revert of formatter output (which Biome would +re-apply on the next save). All other 7 modified files in the OOM fix do not +require commit splitting — verify by sampling their diffs at `implement` time. + +**Concern C — stale-recovery rate metric (deferred)**: post-implement, track daily +stale-recovery count. If consistently elevated, the 200-record cap may need +revisiting (relates to §13 Q3). Not a blocker; suggested for follow-up flow. 
+ +### Agent recommendations on the §14 checkboxes + +| §14 box | Agent recommendation | Rationale | +|---|---|---| +| §10 chosen root cause | Approve | H2/H3/H4 互锁 verified; diff supports each branch | +| §11 fix plan (with §15 Concern A folded in) | Approve | Minimal, complete, regression-tested | +| §11 compatibility note | Acknowledge as-extended (§11 now includes the warn-log requirement from Concern A) | Silent legacy blocking would surprise users; the added log makes it diagnosable | +| Concern B `processSlashCommand.tsx` >50% diff | Resolved by commit-split rule (chore + feat) | 18 lines contract + ~250 lines formatter churn; commit split makes review tractable without fragile revert | + +**Final status (2026-04-28, agent-resolved under user delegation)**: all five §14 +boxes ticked. Flow `recurring-bug-loop-oom` may advance from `report` to +`regression-test`. Implement-time obligations folded in: + +1. Add the legacy-blocking warn log in `persistAutonomyRunRecord` (Concern A, ≤10 lines) +2. Commit-split `processSlashCommand.tsx` into chore + feat (Concern B) +3. Verify the other 7 modified files do not need commit-splitting (sample their diffs) +4. Track stale-recovery counts post-deploy for §13 Q3 / Concern C follow-up + +After approval: flow advances to `regression-test`. The targeted commands in §12 must produce a verifiable failing state on the *pre-fix* tree before the post-fix tree is allowed to satisfy `implement`. Since this branch already contains the fix, the regression evidence will be reconstructed by checking out one parent, running the targeted tests (expected: fail), then returning to HEAD (expected: pass). 
diff --git a/docs/agent/sur-skill-overflow-bugs.md b/docs/agent/sur-skill-overflow-bugs.md new file mode 100644 index 0000000000..2db163ee5c --- /dev/null +++ b/docs/agent/sur-skill-overflow-bugs.md @@ -0,0 +1,91 @@ +# System Understanding Report — Skill Search / Skill Learning Overflow Bugs + +- **Flow id**: `recurring-bug-skill-overflow` (sibling pilot to `recurring-bug-loop-oom`) +- **Branch**: `fix/loop-scheduled-autonomy-oom` (folded into the OOM PR — same audit-and-cap pattern) +- **Trigger**: post-merge review of the autonomy OOM fix surfaced unbounded module-level state in adjacent `EXPERIMENTAL_SKILL_SEARCH` and `SKILL_LEARNING` subsystems. The user explicitly asked for a `肯定也有同类溢出` audit. + +--- + +## 1. Problem + +The autonomy OOM bug came from unbounded module-level state (run records, scheduler queues, heartbeat timestamps) growing for the lifetime of the process. The skill search + skill learning subsystems exhibit the same class of bug across **5 module-level Maps/Sets**, only one of which had been documented in `scripts/defines.ts` ("projectContext cache 无淘汰机制(非 GB 级主因)"). + +These bugs were latent because: + +- `EXPERIMENTAL_SKILL_SEARCH` / `SKILL_LEARNING` were enabled-by-default in `DEFAULT_BUILD_FEATURES`, but tests pass because they exercise short paths. +- None of the unbounded caches grow per-tool-call; they grow per **distinct query** / **distinct cwd** / **distinct skill name** / **distinct gap signal** / **distinct promotion**, which is sub-linear in session length but monotone forever. +- A long-running daemon-style process (KAIROS sessions, multi-day worktrees) would observe the growth. + +## 2. 
Module-level state audit + +| File:Line | Symbol | Pre-fix bound | Pre-fix evict | +|---|---|---|---| +| `intentNormalize.ts:52` | `cache: Map` | none | only `clearIntentNormalizeCache()` for tests | +| `prefetch.ts:17` | `discoveredThisSession: Set` | none | none | +| `prefetch.ts:18` | `recordedGapSignals: Set` | none | none | +| `projectContext.ts:48` | `contextCache: Map` | none | only `resetProjectContextCacheForTest()` | +| `promotion.ts:26` | `sessionPromotedIds: Set` | none | only `resetPromotionBookkeeping()` for tests | +| `runtimeObserver.ts:61` | `lastProcessedMessageIds: Set` | **MAX 1000** | FIFO trim ✓ already bounded | +| `toolEventObserver.ts:50` | `emittedTurns: Map>` | **MAP_MAX 50, SET_MAX 100** | LRU prune via `pruneEmittedTurns()` called inside `markTurn` ✓ already bounded | +| `observerBackend.ts:21` | `registry: Map` | fixed N | n/a — registry pattern, finite ✓ | + +**5 unbounded out of 8 module-level mutables.** All 5 are addressed in this PR. + +## 3. Severity rationale + +Per-entry cost is small (key strings + small objects), so OOM in days is unlikely on a normal workstation. But the canary scenarios: + +- **`intentNormalize.cache`**: every distinct Chinese query → Haiku call → cached. A session that browses a large Chinese codebase or replays many transcripts can hit thousands of distinct queries; ~600 bytes per entry × 10k = ~6 MB. Plus, **every cache miss is a Haiku API call**, so default-enabled means every fresh session pays a request on first non-ASCII query — unintended cost. +- **`projectContext.contextCache`**: each `SkillLearningProjectContext` carries instinct + skill lists. Multi-worktree orchestrators (this very repo!) blow past the typical "1 cwd per session" assumption. +- **`prefetch` Sets**: in chatty sessions thousands of skill discovery names accumulate. +- **`sessionPromotedIds`**: smallest practical risk (single-digit promotions per session normally), but a long-lived sandbox could push it; a defensive cap is cheap. 
+ +The fix bounds all 5 with FIFO/LRU eviction at sensible sizes (32–1000 entries). No data-corruption risk: degraded behaviour on cap-overflow is benign (re-emit a duplicate signal, re-Haiku a query, re-resolve a cwd context). Same risk profile as the autonomy stale-recovery design. + +## 4. Fix surface + +| File | Change | +|---|---| +| `src/services/skillSearch/intentNormalize.ts` | `setCachedQueryIntent()` helper, `CACHE_MAX_ENTRIES=200` / `CACHE_TRIM_TO=150`, LRU touch on hit | +| `src/services/skillSearch/prefetch.ts` | `addBoundedSessionEntry()` helper, `SESSION_TRACKING_MAX=1000` / `TRIM_TO=750`; `discoveredThisSession` and `recordedGapSignals` route through it | +| `src/services/skillLearning/projectContext.ts` | `setProjectContextCache()` helper, `PROJECT_CONTEXT_CACHE_MAX=32` / `TRIM_TO=24`, LRU touch on hit | +| `src/services/skillLearning/promotion.ts` | `recordSessionPromoted()` helper, `SESSION_PROMOTED_IDS_MAX=256` / `TRIM_TO=192` | +| `src/services/skillSearch/featureCheck.ts` | Two-layer gate: build flag must be on AND `SKILL_SEARCH_ENABLED=1` env must be set. Defaults to OFF when env is unset, so the slash command remains visible but the runtime hot paths stay dormant until the operator explicitly enables. | +| `src/services/skillLearning/featureCheck.ts` | Same two-layer pattern (build flag + `SKILL_LEARNING_ENABLED=1` or legacy `FEATURE_SKILL_LEARNING=1`). | +| `scripts/defines.ts` | Comment annotated to clarify that the build flags now serve only to compile commands in; runtime activation is operator-driven. | + +## 5. Why default-off (without removing from build)? + +Three reasons aside from the unbounded-cache concern: + +1. **Implicit cost**: `intentNormalize` calls Haiku on cache miss. Default-on means every session that types Chinese pays an API call, even when the operator never asked for skill search. +2. **Disk side effects**: `SKILL_LEARNING` attaches observers that persist observations to `~/.claude` storage.
Storage volume should be opt-in, not background. +3. **Experimental status**: the flag is literally named `EXPERIMENTAL_*`. Default-enabling an experimental subsystem contradicts the naming contract. + +**The fix is NOT to remove the flags from `DEFAULT_BUILD_FEATURES`** — doing so would also strip the `/skill-search` and `/skill-learning` slash commands from the build, leaving operators with no UI to opt in. Instead the activation logic in `featureCheck.ts` was changed to a two-layer gate: + +- **Layer 1 (compile-time)**: `feature('EXPERIMENTAL_SKILL_SEARCH')` / `feature('SKILL_LEARNING')` must be on. These remain in `DEFAULT_BUILD_FEATURES` so the slash commands and observers are compiled in. +- **Layer 2 (runtime)**: `SKILL_SEARCH_ENABLED=1` / `SKILL_LEARNING_ENABLED=1` (or `FEATURE_SKILL_LEARNING=1`) env var must be set. Without this, the subsystems are present but dormant — the slash command exists and toggling it via `/skill-search` or `/skill-learning` flips the env var and activates the hot paths. + +Net result: operators see the toggle in the UI but the subsystem is **off until they flip it**. + +## 6. Out of scope (filed for follow-up) + +- **Test failures on CI** (`prefetch.test.ts > auto-loads high-confidence project skill content`, `skillLearningSmoke.test.ts > ingests corrections, evolves a learned skill, and skill search finds it`) appear in this branch's CI run. Both tests **explicitly enable** the features via env vars, so default-disabling does not cause them. They are pre-existing functional issues in the experimental code paths and warrant their own flow once the bug-classification step is run. Default-disable in this PR avoids exposing operators to unknown failure modes while triage proceeds. +- **Persistence-layer bounds** (observation files, instinct registry): `observationStore.ts` already has 30-day purge and 1MB archive thresholds; `skillGapStore.ts` uses a finite-state lifecycle. 
Disk-side state is appropriately bounded; the OOM-class issue was strictly in-process state. + +## 7. Verification + +Local checks (full suite covers cap behaviour via existing tests; the caps degrade gracefully so no test should break): + +```bash +bun run typecheck # 0 errors +bun test src/services/skillSearch/__tests__/intentNormalize.test.ts +bun test src/services/skillSearch/__tests__/prefetch.extractQuery.test.ts +bun test src/services/skillLearning/__tests__/projectContext.test.ts +bun test src/services/skillLearning/__tests__/promotion.test.ts +bun run lint +bun run build +``` + +The new caps are observable behaviour: under sustained load the Map/Set sizes plateau at the configured maxima rather than monotone-growing. diff --git a/docs/internals/autonomy-jira.md b/docs/internals/autonomy-jira.md new file mode 100644 index 0000000000..5593fdcf9c --- /dev/null +++ b/docs/internals/autonomy-jira.md @@ -0,0 +1,314 @@ +# Autonomy Reliability Jira Drafts + +These tickets are based on the call-chain audit of `/autonomy`, proactive +ticks, HEARTBEAT managed flows, cron scheduling, command queue consumption, +and daemon process supervision. + +## AUT-001: Preserve autonomy lifecycle when queued commands are consumed mid-turn + +Type: Bug +Priority: P0 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +`query.ts` can drain queued prompt/task-notification commands as attachments +during an active turn. Autonomy prompts consumed this way were removed from the +in-memory queue without marking the persisted run as running/completed/failed, +so managed flows could stay stuck in `queued` and never advance. + +Evidence: +- `src/query.ts` drains queued commands via `getCommandsByMaxPriority()`. +- `src/query.ts` removes consumed commands from the queue. +- Lifecycle updates existed only in the normal queued-submit path + `src/utils/handlePromptSubmit.ts` and headless `src/cli/print.ts`. 
+ +Acceptance criteria: +- Mid-turn consumed autonomy commands mark runs `running`. +- Normal query completion finalizes consumed runs and queues next managed-flow + steps. +- Query errors or abort terminal reasons mark consumed runs failed. +- Stale/cancelled autonomy commands are removed from the in-memory queue + without being sent to the model. +- Regression tests cover stale command filtering and managed-flow advancement. + +## AUT-002: Make autonomy run lifecycle transitions terminal-safe + +Type: Bug +Priority: P0 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +Run lifecycle helpers rewrote status unconditionally. A stale in-memory command +could mark a cancelled/completed/failed run back to `running`, causing a +cancelled flow to execute or a terminal flow to be rewritten. + +Evidence: +- `markAutonomyRunRunning`, `markAutonomyRunCompleted`, + `markAutonomyRunFailed`, and `markAutonomyRunCancelled` updated records + without checking current status. +- External CLI cancel cannot remove queued commands living inside another + process, so stale commands are a realistic input. + +Acceptance criteria: +- `queued -> running/completed/failed/cancelled` remains allowed. +- `running -> completed/failed/cancelled` remains allowed. +- Any terminal status rejects later lifecycle updates. +- Rejected transitions do not update managed-flow step state. +- Regression tests cover stale lifecycle calls after cancellation. + +## AUT-003: Prevent proactive and scheduled-task async fire failures from becoming invisible + +Type: Bug +Priority: P1 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +Proactive tick and cron fire callbacks launch detached async work. Failures in +prompt preparation or queue insertion could surface as unhandled rejections or +be lost from diagnostics. In one-shot cron paths, the scheduler has already +decided the task fired. 
+ +Evidence: +- `src/proactive/useProactive.ts` used a detached async IIFE without catch. +- `src/cli/print.ts` proactive and cron paths also detached async work. +- `src/hooks/useScheduledTasks.ts` cron callbacks detached async work. + +Acceptance criteria: +- Detached proactive/cron fire work has explicit error logging. +- REPL proactive tick generation is non-reentrant. +- Tick generation stops queueing after hook unmount. + +## AUT-004: Bound long-running daemon restart timers during shutdown + +Type: Bug +Priority: P1 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +The daemon supervisor scheduled worker restarts with `setTimeout()` but did +not store, clear, or `unref()` the timer. Shutdown during backoff could keep +the supervisor alive until the timer fired, forcing the stop path toward +SIGKILL. + +Evidence: +- `src/daemon/main.ts` scheduled restart timers directly in the worker exit + handler. +- Shutdown only signaled child processes and did not clear restart timers. + +Acceptance criteria: +- Worker restart timers are tracked per worker. +- Shutdown clears any pending restart timers. +- Restart and force-kill grace timers do not keep the supervisor alive alone. + +## AUT-005: Release autonomy persistence lock bookkeeping after each chain + +Type: Bug +Priority: P1 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +`withAutonomyPersistenceLock` stored a chained promise in its map but compared +the map value against the raw current promise during cleanup. That condition +never matched, so root-level lock bookkeeping could accumulate in long-lived +processes that touch many workspaces. + +Evidence: +- `src/utils/autonomyPersistence.ts` stored `previous.then(() => current)`. +- Cleanup compared `persistenceLocks.get(key) === current`. + +Acceptance criteria: +- The stored chained promise is the value used for cleanup comparison. 
+- Existing serialization behavior for same-root calls remains unchanged. +- Tests directly assert same-root lock bookkeeping returns to zero after both + success and failure. + +## AUT-006: Add active-record protection before persistence truncation + +Type: Reliability +Priority: P2 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +Autonomy runs and flows are capped by latest-created/updated order only. +Under high churn, active `queued` or `running` records can be truncated before +completion, which removes recovery evidence and can break managed-flow +advancement. + +Evidence: +- `src/utils/autonomyRuns.ts` keeps the latest 200 runs by `createdAt`. +- `src/utils/autonomyFlows.ts` keeps the latest 100 flows by `updatedAt`. + +Acceptance criteria: +- Active records are retained before completed historical records are trimmed. +- Tests cover trimming with more than the configured cap and active records + near the tail. + +## AUT-007: Treat provider API-error responses as failed autonomy turns + +Type: Bug +Priority: P0 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +Third-party provider adapters can convert provider failures into synthetic +assistant API-error messages instead of throwing. `query.ts` treated +`isApiErrorMessage` terminal responses as `completed`, so an autonomy command +that had already been consumed as a queued attachment could be marked +completed and advance its managed flow even though the provider call failed. + +Evidence: +- `src/services/api/openai/index.ts`, `src/services/api/gemini/index.ts`, and + `src/services/api/grok/index.ts` yield `createAssistantAPIErrorMessage()` on + adapter errors. +- `src/query.ts` skipped stop hooks for API-error assistant messages but + returned `reason: 'completed'`. +- Top-level autonomy finalization used terminal completion to decide whether + to mark consumed runs completed or failed. 
+ +Acceptance criteria: +- Provider API-error assistant messages terminate the query with + `reason: 'model_error'`. +- Any consumed autonomy run is marked failed rather than completed. +- Managed flows do not advance to the next step after provider API errors. +- A regression test simulates provider error after a queued autonomy attachment + has been consumed. + +## AUT-008: Finalize consumed autonomy runs on async-generator close + +Type: Bug +Priority: P0 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +`query()` is an async generator. When its consumer calls `.return()` or breaks +out of iteration, JavaScript executes `finally` blocks and skips code after the +`try/finally`. The previous autonomy finalization ran after the `finally`, so +queued autonomy commands that had already been claimed as `running` could stay +persisted as `running` forever if the REPL/SDK consumer closed the generator. + +Evidence: +- Claimed run IDs were collected during queued attachment injection. +- Completion/failure finalization happened only after `yield* queryLoop(...)` + returned normally or threw. +- Claude cross-validation flagged this as a durable run/flow leak. + +Acceptance criteria: +- Consumed autonomy runs are finalized from a `finally` path. +- Normal completion marks consumed runs completed and enqueues next managed + flow steps. +- Provider/model errors mark consumed runs failed. +- Generator close and user abort terminals mark consumed runs cancelled. +- A regression test closes the generator after a queued autonomy attachment and + verifies the run/flow are cancelled, not left running. + +## AUT-009: Claim queued autonomy runs before attachment injection + +Type: Bug +Priority: P0 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +The query loop filtered stale queued autonomy commands before attachment +generation, but it did not claim runs as `running` until after attachments were +already yielded. 
A concurrent cancellation between those steps could still send +a cancelled prompt into the model context. + +Evidence: +- `partitionConsumableQueuedAutonomyCommands()` only checked persisted status. +- `markAutonomyRunRunning()` previously ran after `getAttachmentMessages()`. +- Reviewer cross-validation identified the check-then-act race. + +Acceptance criteria: +- Query claims queued autonomy runs before passing commands to attachment + generation. +- Only successfully claimed commands are injected as queued-command + attachments. +- Failed claims are treated as stale and removed from the in-memory queue. +- Claiming reads persisted run state once per turn rather than once per + command. + +## AUT-010: Cancel proactive and cron runs dropped before enqueue + +Type: Bug +Priority: P1 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +`/proactive` and scheduled-task producers persist autonomy runs before +returning queue commands. If the component is disposed or headless input closes +after persistence but before enqueue, the queued run is left on disk with no +in-memory command to consume it. + +Evidence: +- `createProactiveAutonomyCommands()` commits runs before returning commands. +- `commitAutonomyQueuedPrompt()` persists scheduled-task runs before callers + enqueue them. +- Callers checked `disposed` / `inputClosed` after command creation and could + return without terminalizing the run. + +Acceptance criteria: +- Proactive hook cancellation checks run both before commit and after command + creation. +- Headless proactive and cron paths cancel any already-created command that is + dropped due to input close. +- REPL scheduled-task cleanup cancels already-created commands when unmounted. +- A regression test verifies a proactive command created but dropped before + enqueue is marked cancelled. 
+ +## AUT-011: Replace query transition `any` stubs with typed contracts + +Type: Test/Type Safety +Priority: P2 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +`src/query/transitions.ts` defined both `Terminal` and `Continue` as `any`. +That allowed new terminal reasons such as `model_error` and continuation +reasons such as `collapse_drain_retry` to drift without compiler checks. + +Evidence: +- Claude cross-validation flagged the `Terminal = any` contract as a remaining + issue. +- Tightening the type immediately caught that + `collapse_drain_retry.committed` is a `number`, not a `boolean`. + +Acceptance criteria: +- `Terminal` is a concrete union of query terminal reasons. +- `Continue` is a concrete union of continuation reasons and payloads. +- `bun run typecheck` validates all query return sites against that contract. + +## AUT-012: Avoid provider test settings-module mock pollution + +Type: Test Reliability +Priority: P2 +Status: Draft +Patch status: Implemented in `fix/autonomy-lifecycle`. + +Problem: +The provider tests previously mocked `settings.js`. A minimal mock broke other +tests that imported additional settings exports in the same Bun process; the +expanded mock avoided the failure but over-coupled the provider test to +unrelated settings internals. + +Evidence: +- Full test runs observed cross-file settings mock pollution. +- `src/utils/model/providers.ts` only needs the real `getInitialSettings()` + behavior. + +Acceptance criteria: +- Provider tests do not mock `settings.js`. +- `modelType` precedence is exercised through an injected settings snapshot, + leaving global bootstrap state untouched. +- Provider tests pass when run alongside permissions tests and the provider + matrix. 
diff --git a/packages/builtin-tools/src/tools/RemoteTriggerTool/__tests__/RemoteTriggerTool.test.ts b/packages/builtin-tools/src/tools/RemoteTriggerTool/__tests__/RemoteTriggerTool.test.ts index eb9b726c82..b3640822f4 100644 --- a/packages/builtin-tools/src/tools/RemoteTriggerTool/__tests__/RemoteTriggerTool.test.ts +++ b/packages/builtin-tools/src/tools/RemoteTriggerTool/__tests__/RemoteTriggerTool.test.ts @@ -1,19 +1,8 @@ import { afterEach, beforeEach, describe, expect, mock, test } from 'bun:test' -import { mkdir, readFile, rm } from 'fs/promises' -import { tmpdir } from 'os' -import { join } from 'path' -import { - resetStateForTests, - setOriginalCwd, - setProjectRoot, -} from 'src/bootstrap/state.js' -import { logMock } from '../../../../../../tests/mocks/log' -import { debugMock } from '../../../../../../tests/mocks/debug' +import { authMock } from '../../../../../../tests/mocks/auth' let requestStatus = 200 - -mock.module('src/utils/log.ts', logMock) -mock.module('src/utils/debug.ts', debugMock) +const auditRecords: Record[] = [] mock.module('axios', () => ({ default: { @@ -24,20 +13,12 @@ mock.module('axios', () => ({ }, })) -mock.module('src/utils/auth.js', () => ({ - checkAndRefreshOAuthTokenIfNeeded: async () => {}, - getClaudeAIOAuthTokens: () => ({ accessToken: 'token' }), -})) +mock.module('src/utils/auth.js', authMock) mock.module('src/services/oauth/client.js', () => ({ getOrganizationUUID: async () => 'org', })) -mock.module('src/constants/oauth.js', () => ({ - getOauthConfig: () => ({ BASE_API_URL: 'https://example.test' }), - fileSuffixForOauthConfig: () => '', -})) - mock.module('src/services/analytics/growthbook.js', () => ({ getFeatureValue_CACHED_MAY_BE_STALE: () => true, })) @@ -46,40 +27,41 @@ mock.module('src/services/policyLimits/index.js', () => ({ isPolicyAllowed: () => true, })) -mock.module('bun:bundle', () => ({ - feature: () => false, -})) - -let cwd = '' -let previousCwd = '' -let auditRecords: Array> = [] +// Narrow mock for the 
side-effectful entries in `src/constants/oauth.js`. +// Pure data exports (ALL_OAUTH_SCOPES, CLAUDE_AI_*_SCOPE, etc.) come from +// the real module and are not mocked, per the test policy that constants +// modules without side effects should not be replaced wholesale. +mock.module('src/constants/oauth.js', () => { + const actual = require('../../../../../../src/constants/oauth.js') + return { + ...actual, + fileSuffixForOauthConfig: () => '', + getOauthConfig: () => ({ BASE_API_URL: 'https://example.test' }), + MCP_CLIENT_METADATA_URL: 'https://example.test/oauth/metadata', + } +}) mock.module('src/utils/remoteTriggerAudit.js', () => ({ - appendRemoteTriggerAuditRecord: async (record: Record) => { - const full = { ...record, auditId: record.auditId ?? 'test-audit-id', createdAt: Date.now() } - auditRecords.push(full) - return full + appendRemoteTriggerAuditRecord: async ( + record: Record, + ) => { + const fullRecord = { + auditId: `audit-${auditRecords.length + 1}`, + createdAt: Date.now(), + ...record, + } + auditRecords.push(fullRecord) + return fullRecord }, - resolveRemoteTriggerAuditPath: () => join(cwd, '.claude', 'remote-trigger-audit.jsonl'), })) -beforeEach(async () => { +beforeEach(() => { requestStatus = 200 - auditRecords = [] - previousCwd = process.cwd() - cwd = join(tmpdir(), `remote-trigger-tool-${Date.now()}-${Math.random().toString(16).slice(2)}`) - await mkdir(cwd, { recursive: true }) - await mkdir(join(cwd, '.claude'), { recursive: true }) - process.chdir(cwd) - resetStateForTests() - setOriginalCwd(cwd) - setProjectRoot(cwd) + auditRecords.length = 0 }) -afterEach(async () => { - resetStateForTests() - process.chdir(previousCwd) - await rm(cwd, { recursive: true, force: true }) +afterEach(() => { + auditRecords.length = 0 }) describe('RemoteTriggerTool audit', () => { @@ -91,10 +73,14 @@ describe('RemoteTriggerTool audit', () => { ) expect(result.data.audit_id).toBeString() + expect(result.data.audit_id).toBe('audit-1') 
expect(auditRecords).toHaveLength(1) - expect(auditRecords[0].action).toBe('run') - expect(auditRecords[0].triggerId).toBe('trigger-1') - expect(auditRecords[0].ok).toBe(true) + expect(auditRecords[0]).toMatchObject({ + action: 'run', + triggerId: 'trigger-1', + ok: true, + status: 200, + }) }) test('writes an audit record before rethrowing validation failures', async () => { @@ -108,8 +94,10 @@ describe('RemoteTriggerTool audit', () => { ).rejects.toThrow('run requires trigger_id') expect(auditRecords).toHaveLength(1) - expect(auditRecords[0].action).toBe('run') - expect(auditRecords[0].ok).toBe(false) - expect(auditRecords[0].error).toBe('run requires trigger_id') + expect(auditRecords[0]).toMatchObject({ + action: 'run', + ok: false, + error: 'run requires trigger_id', + }) }) }) diff --git a/packages/color-diff-napi/src/index.ts b/packages/color-diff-napi/src/index.ts index 9fe5240ede..692728e2a9 100644 --- a/packages/color-diff-napi/src/index.ts +++ b/packages/color-diff-napi/src/index.ts @@ -18,76 +18,19 @@ */ import { diffArrays } from 'diff' -// Import the minimal highlight.js core (no languages) instead of the full -// bundle that loads 190+ grammars (~5-15MB). Individual languages are -// imported statically below and registered on the core instance. Static -// imports work in Bun --compile mode (only createRequire fails). 
-import hljs from 'highlight.js/lib/core' +import hljs from 'highlight.js' import { basename, extname } from 'path' -// --- Register commonly-used languages (~25 instead of 190+) --- -import langBash from 'highlight.js/lib/languages/bash' -import langC from 'highlight.js/lib/languages/c' -import langCmake from 'highlight.js/lib/languages/cmake' -import langCpp from 'highlight.js/lib/languages/cpp' -import langCsharp from 'highlight.js/lib/languages/csharp' -import langCss from 'highlight.js/lib/languages/css' -import langDiff from 'highlight.js/lib/languages/diff' -import langDockerfile from 'highlight.js/lib/languages/dockerfile' -import langGo from 'highlight.js/lib/languages/go' -import langGraphQL from 'highlight.js/lib/languages/graphql' -import langJava from 'highlight.js/lib/languages/java' -import langJavaScript from 'highlight.js/lib/languages/javascript' -import langJson from 'highlight.js/lib/languages/json' -import langKotlin from 'highlight.js/lib/languages/kotlin' -import langMakefile from 'highlight.js/lib/languages/makefile' -import langMarkdown from 'highlight.js/lib/languages/markdown' -import langPerl from 'highlight.js/lib/languages/perl' -import langPhp from 'highlight.js/lib/languages/php' -import langPython from 'highlight.js/lib/languages/python' -import langRuby from 'highlight.js/lib/languages/ruby' -import langRust from 'highlight.js/lib/languages/rust' -import langShell from 'highlight.js/lib/languages/shell' -import langSql from 'highlight.js/lib/languages/sql' -import langTypeScript from 'highlight.js/lib/languages/typescript' -import langXml from 'highlight.js/lib/languages/xml' -import langYaml from 'highlight.js/lib/languages/yaml' - -hljs.registerLanguage('bash', langBash) -hljs.registerLanguage('c', langC) -hljs.registerLanguage('cmake', langCmake) -hljs.registerLanguage('cpp', langCpp) -hljs.registerLanguage('csharp', langCsharp) -hljs.registerLanguage('css', langCss) -hljs.registerLanguage('diff', langDiff) 
-hljs.registerLanguage('dockerfile', langDockerfile) -hljs.registerLanguage('go', langGo) -hljs.registerLanguage('graphql', langGraphQL) -hljs.registerLanguage('java', langJava) -hljs.registerLanguage('javascript', langJavaScript) -hljs.registerLanguage('json', langJson) -hljs.registerLanguage('kotlin', langKotlin) -hljs.registerLanguage('makefile', langMakefile) -hljs.registerLanguage('markdown', langMarkdown) -hljs.registerLanguage('perl', langPerl) -hljs.registerLanguage('php', langPhp) -hljs.registerLanguage('python', langPython) -hljs.registerLanguage('ruby', langRuby) -hljs.registerLanguage('rust', langRust) -hljs.registerLanguage('shell', langShell) -hljs.registerLanguage('sql', langSql) -hljs.registerLanguage('typescript', langTypeScript) -hljs.registerLanguage('xml', langXml) -hljs.registerLanguage('yaml', langYaml) -// JavaScript grammar also handles .mjs/.cjs extensions -// TypeScript grammar also handles .tsx via auto-detection - +// Static import — createRequire(import.meta.url) fails in Bun --compile mode +// because the resolved path points to the internal bunfs binary path where +// node_modules cannot be found. A top-level import ensures the module is +// bundled and accessible at runtime. type HLJSApi = typeof hljs let cachedHljs: HLJSApi | null = null function hljsApi(): HLJSApi { if (cachedHljs) return cachedHljs - // highlight.js/lib/core uses `export =` (CJS). Under bun/ESM the interop - // wraps it in .default; under node CJS the module IS the API. Check at runtime. + // highlight.js uses `export =` (CJS). Under bun/ESM the interop wraps it + // in .default; under node CJS the module IS the API. Check at runtime. const mod = hljs as HLJSApi & { default?: HLJSApi } cachedHljs = 'default' in mod && mod.default ? mod.default : mod return cachedHljs! 
diff --git a/scripts/defines.ts b/scripts/defines.ts index 1cff1337a6..7c482f31cf 100644 --- a/scripts/defines.ts +++ b/scripts/defines.ts @@ -66,9 +66,16 @@ export const DEFAULT_BUILD_FEATURES = [ 'COMMIT_ATTRIBUTION', // Git 提交归属追踪(记录 AI 辅助贡献) // Server mode (claude server / claude open) 'DIRECT_CONNECT', // 直连模式(claude server / claude open) - // Skill search & learning - 'EXPERIMENTAL_SKILL_SEARCH', // 实验性技能搜索(DiscoverSkills) - // 'SKILL_LEARNING', // projectContext cache 无淘汰机制(非 GB 级主因) + // Skill search & learning — feature flags compiled in (so the slash + // commands /skill-* etc. exist), but the runtime "enabled" toggle + // defaults to OFF (see featureCheck.ts). Operators turn on via the + // slash-command toggle or env vars (SKILL_SEARCH_ENABLED=1, + // SKILL_LEARNING_ENABLED=1). Rationale: bounded caches added on + // this branch (see docs/agent/sur-skill-overflow-bugs.md) close the + // overflow risk, but Haiku-on-first-Chinese-query and disk-side + // observation accumulation remain operator-discretion concerns. + 'EXPERIMENTAL_SKILL_SEARCH', + 'SKILL_LEARNING', // P3: poor mode 'POOR', // 穷鬼模式,跳过 extract_memories/prompt_suggestion 减少消耗 // Team Memory diff --git a/src/Tool.ts b/src/Tool.ts index c8c7a98956..6008807511 100644 --- a/src/Tool.ts +++ b/src/Tool.ts @@ -178,6 +178,19 @@ export type ToolUseContext = { querySource?: QuerySource /** Optional callback to get the latest tools (e.g., after MCP servers connect mid-query) */ refreshTools?: () => Tools + /** + * @internal TEST-ONLY ESCAPE HATCH. MUST remain undefined in production. + * + * Allows non-bundled unit-test harnesses to exercise the background + * forked slash command path that production assistant mode gates behind + * `feature('KAIROS')`. Still requires `AppState.kairosEnabled`. This + * field is constructed in-process by trusted application code only; + * no external surface (MCP, plugin, slash command, network) writes to + * `ToolUseContext.options`. 
Setting this true outside a test bypasses + * the KAIROS feature flag; `processSlashCommand` rejects this flag + * outside `NODE_ENV=test`. + */ + allowBackgroundForkedSlashCommands?: boolean } abortController: AbortController readFileState: FileStateCache diff --git a/src/__tests__/handlePromptSubmit.test.ts b/src/__tests__/handlePromptSubmit.test.ts index 7fa2f663d2..1c0cca36fb 100644 --- a/src/__tests__/handlePromptSubmit.test.ts +++ b/src/__tests__/handlePromptSubmit.test.ts @@ -1,8 +1,18 @@ -import { beforeEach, describe, expect, mock, test } from 'bun:test' +import { afterEach, beforeEach, describe, expect, mock, test } from 'bun:test' import { createAbortController } from '../utils/abortController' import { QueryGuard } from '../utils/QueryGuard' import { handlePromptSubmit } from '../utils/handlePromptSubmit' -import { getCommandQueue, resetCommandQueue } from '../utils/messageQueueManager' +import { + getCommandQueue, + resetCommandQueue, +} from '../utils/messageQueueManager' +import { cleanupTempDir, createTempDir } from '../../tests/mocks/file-system' +import { + createAutonomyQueuedPrompt, + markAutonomyRunCancelled, +} from '../utils/autonomyRuns' + +let tempDirs: string[] = [] function createBaseParams() { const queryGuard = new QueryGuard() @@ -28,11 +38,9 @@ function createBaseParams() { commands: [], setUserInputOnProcessing: mock((_prompt?: string) => {}), setAbortController: mock((_abortController: AbortController | null) => {}), - onQuery: mock( - async () => undefined, - ) as unknown as ( + onQuery: mock(async () => true) as unknown as ( ...args: unknown[] - ) => Promise, + ) => Promise, setAppState: mock((_updater: unknown) => {}), } } @@ -40,6 +48,13 @@ function createBaseParams() { describe('handlePromptSubmit', () => { beforeEach(() => { resetCommandQueue() + tempDirs = [] + }) + + afterEach(async () => { + for (const tempDir of tempDirs) { + await cleanupTempDir(tempDir) + } }) test('aborts the current turn when only cancel-interrupt 
tools are running', async () => { @@ -118,4 +133,34 @@ describe('handlePromptSubmit', () => { bridgeOrigin: true, }) }) + + test('skips stale autonomy commands in the idle queued path', async () => { + const params = createBaseParams() + const abortController = createAbortController() + const tempDir = await createTempDir('handle-prompt-autonomy-') + tempDirs.push(tempDir) + const command = await createAutonomyQueuedPrompt({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + }) + expect(command).not.toBeNull() + await markAutonomyRunCancelled(command!.autonomy!.runId, tempDir) + + await handlePromptSubmit({ + ...params, + input: '', + mode: 'prompt', + pastedContents: {}, + abortController, + streamMode: 'normal' as any, + hasInterruptibleToolInProgress: false, + isExternalLoading: false, + queuedCommands: [command!], + }) + + expect(params.getToolUseContext).not.toHaveBeenCalled() + expect(params.onQuery).not.toHaveBeenCalled() + }) }) diff --git a/src/__tests__/queryAutonomyProviderBoundary.test.ts b/src/__tests__/queryAutonomyProviderBoundary.test.ts new file mode 100644 index 0000000000..5da040c13b --- /dev/null +++ b/src/__tests__/queryAutonomyProviderBoundary.test.ts @@ -0,0 +1,337 @@ +import { afterEach, beforeEach, describe, expect, test } from 'bun:test' +import { randomUUID } from 'crypto' +import { + resetStateForTests, + setCwdState, + setOriginalCwd, + setProjectRoot, +} from '../bootstrap/state' +import { query } from '../query' +import { getEmptyToolPermissionContext } from '../Tool' +import type { AssistantMessage } from '../types/message' +import { asSystemPrompt } from '../utils/systemPromptType' +import { + createAssistantAPIErrorMessage, + createUserMessage, +} from '../utils/messages' +import { cleanupTempDir, createTempDir } from '../../tests/mocks/file-system' +import { + enqueue, + getCommandsByMaxPriority, + resetCommandQueue, +} from '../utils/messageQueueManager' +import { 
getAutonomyFlowById, listAutonomyFlows } from '../utils/autonomyFlows' +import { + getAutonomyRunById, + startManagedAutonomyFlowFromHeartbeatTask, +} from '../utils/autonomyRuns' + +let tempDir = '' +let originalProcessCwd = '' + +beforeEach(async () => { + originalProcessCwd = process.cwd() + tempDir = await createTempDir('query-autonomy-provider-boundary-') + resetStateForTests() + resetCommandQueue() + setOriginalCwd(tempDir) + setCwdState(tempDir) + setProjectRoot(tempDir) +}) + +afterEach(async () => { + resetStateForTests() + resetCommandQueue() + if (originalProcessCwd) { + process.chdir(originalProcessCwd) + } + if (tempDir) { + let lastError: unknown + for (let attempt = 0; attempt < 20; attempt++) { + try { + await cleanupTempDir(tempDir) + lastError = undefined + break + } catch (error) { + lastError = error + await new Promise(resolve => setTimeout(resolve, 100)) + } + } + if (lastError) { + throw lastError + } + } +}) + +function createToolUseAssistantMessage(): AssistantMessage { + return { + type: 'assistant', + uuid: randomUUID(), + timestamp: new Date().toISOString(), + requestId: undefined, + message: { + id: 'msg_tool_use', + type: 'message', + role: 'assistant', + model: 'test-model', + stop_reason: 'tool_use', + stop_sequence: null, + usage: { + input_tokens: 1, + output_tokens: 1, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + }, + content: [ + { + type: 'tool_use', + id: 'toolu_provider_boundary', + name: 'MissingBoundaryTool', + input: {}, + }, + ], + }, + } as unknown as AssistantMessage +} + +function createToolUseContext(): any { + let inProgressToolUseIds = new Set() + let responseLength = 0 + let appState = { + toolPermissionContext: getEmptyToolPermissionContext(), + fastMode: false, + mcp: { + tools: [], + clients: [], + }, + effortValue: undefined, + advisorModel: undefined, + sessionHooks: new Map(), + } + + return { + options: { + commands: [], + debug: false, + mainLoopModel: 'claude-sonnet-4-5-20250929', + 
tools: [], + verbose: false, + thinkingConfig: { type: 'disabled' }, + mcpClients: [], + mcpResources: {}, + isNonInteractiveSession: true, + agentDefinitions: { + activeAgents: [], + allowedAgentTypes: [], + }, + }, + abortController: new AbortController(), + readFileState: new Map(), + getAppState: () => appState, + setAppState: (updater: (state: any) => any) => { + appState = updater(appState as never) + }, + setInProgressToolUseIDs: (updater: (state: Set) => Set) => { + inProgressToolUseIds = updater(inProgressToolUseIds) + }, + setResponseLength: (updater: (state: number) => number) => { + responseLength = updater(responseLength) + }, + updateFileHistoryState: () => {}, + updateAttributionState: () => {}, + messages: [], + } as any +} + +describe('query autonomy/provider boundary', () => { + test('provider api-error messages fail a consumed autonomy run instead of advancing the flow', async () => { + const previousDisableAttachments = + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS = '1' + try { + const command = await startManagedAutonomyFlowFromHeartbeatTask({ + task: { + name: 'provider-boundary', + interval: '1h', + prompt: 'Exercise provider boundary', + steps: [ + { name: 'first', prompt: 'First provider-boundary step' }, + { name: 'second', prompt: 'Second provider-boundary step' }, + ], + }, + rootDir: tempDir, + currentDir: tempDir, + priority: 'next', + }) + expect(command).not.toBeNull() + enqueue(command!) 
+ + const toolUseContext = createToolUseContext() + + let callCount = 0 + const deps = { + uuid: () => 'query-chain-id', + microcompact: async (messages: unknown[]) => ({ messages }), + autocompact: async () => ({ + compactionResult: undefined, + consecutiveFailures: 0, + }), + callModel: async function* () { + callCount += 1 + if (callCount === 1) { + yield createToolUseAssistantMessage() + return + } + yield createAssistantAPIErrorMessage({ + content: 'API Error: provider unavailable', + apiError: 'api_error', + error: new Error('provider unavailable') as never, + }) + }, + } + + const emitted: any[] = [] + const generator = query({ + messages: [ + createUserMessage({ + content: 'start provider-boundary test', + }), + ], + systemPrompt: asSystemPrompt([]), + userContext: {}, + systemContext: {}, + canUseTool: async (_tool, input) => ({ + behavior: 'allow', + updatedInput: input, + }), + toolUseContext, + querySource: 'sdk', + maxTurns: 3, + deps: deps as never, + }) + let next = await generator.next() + while (!next.done) { + emitted.push(next.value) + next = await generator.next() + } + + const [flow] = await listAutonomyFlows(tempDir) + const finalFlow = await getAutonomyFlowById(flow!.flowId, tempDir) + const run = await getAutonomyRunById(command!.autonomy!.runId, tempDir) + + expect(next.value.reason).toBe('model_error') + expect(callCount).toBe(2) + expect( + emitted.some( + message => + message.type === 'attachment' && + message.attachment.type === 'queued_command', + ), + ).toBe(true) + expect(run!.status).toBe('failed') + expect(run!.error).toBe('provider api_error') + expect(finalFlow!.status).toBe('failed') + expect(finalFlow!.stateJson!.steps.map(step => step.status)).toEqual([ + 'failed', + 'pending', + ]) + expect(getCommandsByMaxPriority('later')).toHaveLength(0) + } finally { + if (previousDisableAttachments === undefined) { + delete process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS + } else { + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS = 
previousDisableAttachments + } + } + }) + + test('generator return cancels a consumed autonomy run instead of leaving it running', async () => { + const previousDisableAttachments = + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS = '1' + try { + const command = await startManagedAutonomyFlowFromHeartbeatTask({ + task: { + name: 'return-boundary', + interval: '1h', + prompt: 'Exercise generator return boundary', + steps: [ + { name: 'first', prompt: 'First return-boundary step' }, + { name: 'second', prompt: 'Second return-boundary step' }, + ], + }, + rootDir: tempDir, + currentDir: tempDir, + priority: 'next', + }) + expect(command).not.toBeNull() + enqueue(command!) + + const toolUseContext = createToolUseContext() + const deps = { + uuid: () => 'query-chain-id', + microcompact: async (messages: unknown[]) => ({ messages }), + autocompact: async () => ({ + compactionResult: undefined, + consecutiveFailures: 0, + }), + callModel: async function* () { + yield createToolUseAssistantMessage() + }, + } + + const generator = query({ + messages: [ + createUserMessage({ + content: 'start return-boundary test', + }), + ], + systemPrompt: asSystemPrompt([]), + userContext: {}, + systemContext: {}, + canUseTool: async (_tool, input) => ({ + behavior: 'allow', + updatedInput: input, + }), + toolUseContext, + querySource: 'sdk', + maxTurns: 3, + deps: deps as never, + }) + + let sawQueuedAttachment = false + let next = await generator.next() + while (!next.done) { + const message = next.value as any + if ( + message.type === 'attachment' && + message.attachment.type === 'queued_command' + ) { + sawQueuedAttachment = true + await generator.return(undefined as never) + break + } + next = await generator.next() + } + + const [flow] = await listAutonomyFlows(tempDir) + const finalFlow = await getAutonomyFlowById(flow!.flowId, tempDir) + const run = await getAutonomyRunById(command!.autonomy!.runId, tempDir) + + 
expect(sawQueuedAttachment).toBe(true) + expect(run!.status).toBe('cancelled') + expect(finalFlow!.status).toBe('cancelled') + expect(finalFlow!.stateJson!.steps.map(step => step.status)).toEqual([ + 'cancelled', + 'cancelled', + ]) + expect(getCommandsByMaxPriority('later')).toHaveLength(0) + } finally { + if (previousDisableAttachments === undefined) { + delete process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS + } else { + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS = previousDisableAttachments + } + } + }) +}) diff --git a/src/cli/print.ts b/src/cli/print.ts index c4e8c45697..8b0aedc462 100644 --- a/src/cli/print.ts +++ b/src/cli/print.ts @@ -321,16 +321,15 @@ import { } from 'src/utils/queryProfiler.js' import { asSessionId } from 'src/types/ids.js' import { - commitAutonomyQueuedPrompt, - createAutonomyQueuedPrompt, + createAutonomyQueuedPromptIfNoActiveSource, createProactiveAutonomyCommands, - finalizeAutonomyRunCompleted, - finalizeAutonomyRunFailed, - markAutonomyRunCompleted, markAutonomyRunFailed, - markAutonomyRunRunning, } from 'src/utils/autonomyRuns.js' -import { prepareAutonomyTurnPrompt } from 'src/utils/autonomyAuthority.js' +import { + cancelQueuedAutonomyCommands, + claimConsumableQueuedAutonomyCommands, + finalizeAutonomyCommandsForTurn, +} from 'src/utils/autonomyQueueLifecycle.js' import { jsonStringify } from '../utils/slowOperations.js' import { skillChangeDetector } from '../utils/skills/skillChangeDetector.js' import { getCommands, clearCommandsCache } from '../commands.js' @@ -1865,17 +1864,26 @@ function runHeadlessStreaming( currentDir: cwd(), shouldCreate: () => !inputClosed, }) + if (inputClosed) { + await cancelQueuedAutonomyCommands({ commands }) + return + } for (const command of commands) { - if (inputClosed) { - return - } enqueue({ ...command, uuid: randomUUID(), }) } void run() - })() + })().catch(error => { + logError(error) + logForDebugging( + `[Proactive] failed to create headless tick: ${error}`, + { + level: 'error', + }, + ) 
+ }) }, 0) } : undefined @@ -1971,17 +1979,24 @@ function runHeadlessStreaming( // Non-prompt commands (task-notification, orphaned-permission) carry // side effects or orphanedPermission state, so they process singly. // Prompt commands greedily collect followers with matching workload. - const batch: QueuedCommand[] = [command] + let batch: QueuedCommand[] = [command] if (command.mode === 'prompt') { while (canBatchWith(command, peek(isMainThread))) { batch.push(dequeue(isMainThread)!) } - if (batch.length > 1) { - command = { - ...command, - value: joinPromptValues(batch.map(c => c.value)), - uuid: batch.findLast(c => c.uuid)?.uuid ?? command.uuid, - } + } + const queuedAutonomyClaim = + await claimConsumableQueuedAutonomyCommands(batch) + batch = queuedAutonomyClaim.attachmentCommands + if (batch.length === 0) { + continue + } + command = batch[0]! + if (command.mode === 'prompt' && batch.length > 1) { + command = { + ...command, + value: joinPromptValues(batch.map(c => c.value)), + uuid: batch.findLast(c => c.uuid)?.uuid ?? command.uuid, } } const batchUuids = batch.map(c => c.uuid).filter(u => u !== undefined) @@ -2120,9 +2135,7 @@ function runHeadlessStreaming( } const input = command.value - const autonomyRunIds = batch - .map(item => item.autonomy?.runId) - .filter((runId): runId is string => Boolean(runId)) + const claimedAutonomyCommands = queuedAutonomyClaim.claimedCommands if (structuredIO instanceof RemoteIO && command.mode === 'prompt') { logEvent('tengu_bridge_message_received', { @@ -2172,9 +2185,6 @@ function runHeadlessStreaming( // const-capture: TS loses `while ((command = dequeue()))` narrowing // inside the closure. 
const cmd = command - for (const runId of autonomyRunIds) { - await markAutonomyRunRunning(runId) - } let lastResultIsError = false try { await runWithWorkload( @@ -2286,35 +2296,39 @@ function runHeadlessStreaming( }, ) // end runWithWorkload if (lastResultIsError) { - for (const runId of autonomyRunIds) { - await finalizeAutonomyRunFailed({ - runId, - error: 'ask() returned an error result', - }) - } + await finalizeAutonomyCommandsForTurn({ + commands: claimedAutonomyCommands, + outcome: { + type: 'failed', + message: 'ask() returned an error result', + }, + currentDir: cwd(), + priority: 'later', + workload: cmd.workload ?? options.workload, + }) } else { - for (const runId of autonomyRunIds) { - const nextCommands = await finalizeAutonomyRunCompleted({ - runId, - currentDir: cwd(), - priority: 'later', - workload: cmd.workload ?? options.workload, + const nextCommands = await finalizeAutonomyCommandsForTurn({ + commands: claimedAutonomyCommands, + outcome: { type: 'completed' }, + currentDir: cwd(), + priority: 'later', + workload: cmd.workload ?? options.workload, + }) + for (const nextCommand of nextCommands) { + enqueue({ + ...nextCommand, + uuid: randomUUID(), }) - for (const nextCommand of nextCommands) { - enqueue({ - ...nextCommand, - uuid: randomUUID(), - }) - } } } } catch (error) { - for (const runId of autonomyRunIds) { - await finalizeAutonomyRunFailed({ - runId, - error: String(error), - }) - } + await finalizeAutonomyCommandsForTurn({ + commands: claimedAutonomyCommands, + outcome: { type: 'failed', error }, + currentDir: cwd(), + priority: 'later', + workload: cmd.workload ?? 
options.workload, + }) throw error } @@ -2805,72 +2819,90 @@ function runHeadlessStreaming( let cronScheduler: import('../utils/cronScheduler.js').CronScheduler | null = null if (cronGate.isKairosCronEnabled()) { + // Shared dedup-claim → input-close-recheck → onSuccess pipeline for the + // three cron entry points (legacy onFire, onFireTask agent, onFireTask + // non-agent). Centralizing the cancel-on-late-shutdown contract here keeps + // the three branches from drifting on what happens between claim and + // dispatch. onSuccess receives the claimed QueuedCommand and decides + // whether to enqueue it (normal path) or mark the run failed (agent path). + const dispatchHeadlessCronCommand = (params: { + basePrompt: string + sourceId: string + sourceLabel: string + logSuffix: string + onSuccess: (command: QueuedCommand) => void | Promise + }): void => { + if (inputClosed) return + void (async () => { + const command = await createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: params.basePrompt, + trigger: 'scheduled-task', + currentDir: cwd(), + sourceId: params.sourceId, + sourceLabel: params.sourceLabel, + workload: WORKLOAD_CRON, + shouldCreate: () => !inputClosed, + }) + if (!command) return + if (inputClosed) { + await cancelQueuedAutonomyCommands({ commands: [command] }) + return + } + await params.onSuccess(command) + })().catch(error => { + logError(error) + logForDebugging( + `[ScheduledTasks] failed to enqueue headless task${params.logSuffix}: ${error}`, + { level: 'error' }, + ) + }) + } + + const enqueueAndRun = (command: QueuedCommand): void => { + enqueue({ + ...command, + uuid: randomUUID(), + }) + void run() + } + cronScheduler = cronSchedulerModule.createCronScheduler({ onFire: prompt => { - if (inputClosed) return - void (async () => { - const prepared = await prepareAutonomyTurnPrompt({ - basePrompt: prompt, - trigger: 'scheduled-task', - currentDir: cwd(), - }) - if (inputClosed) return - const command = await commitAutonomyQueuedPrompt({ - 
prepared, - currentDir: cwd(), - workload: WORKLOAD_CRON, - }) - if (inputClosed) return - enqueue({ - ...command, - uuid: randomUUID(), - }) - void run() - })() + // Legacy KAIROS-style entries: the prompt text is what uniquely + // identifies the cron entry, so it doubles as both source id and + // source label for dedup. + dispatchHeadlessCronCommand({ + basePrompt: prompt, + sourceId: prompt, + sourceLabel: prompt, + logSuffix: '', + onSuccess: enqueueAndRun, + }) }, onFireTask: task => { - if (inputClosed) return - void (async () => { - if (task.agentId) { - const prepared = await prepareAutonomyTurnPrompt({ - basePrompt: task.prompt, - trigger: 'scheduled-task', - currentDir: cwd(), - }) - if (inputClosed) return - const command = await commitAutonomyQueuedPrompt({ - prepared, - currentDir: cwd(), - sourceId: task.id, - sourceLabel: task.prompt, - workload: WORKLOAD_CRON, - }) - await markAutonomyRunFailed( - command.autonomy!.runId, - `No teammate runtime available for scheduled task owner ${task.agentId} in headless mode.`, - ) - return - } - const prepared = await prepareAutonomyTurnPrompt({ + if (task.agentId) { + dispatchHeadlessCronCommand({ basePrompt: task.prompt, - trigger: 'scheduled-task', - currentDir: cwd(), - }) - if (inputClosed) return - const command = await commitAutonomyQueuedPrompt({ - prepared, - currentDir: cwd(), sourceId: task.id, sourceLabel: task.prompt, - workload: WORKLOAD_CRON, - }) - if (inputClosed) return - enqueue({ - ...command, - uuid: randomUUID(), + logSuffix: ` ${task.id}`, + onSuccess: async command => { + await markAutonomyRunFailed( + command.autonomy!.runId, + `No teammate runtime available for scheduled task owner ${task.agentId} in headless mode.`, + command.autonomy!.rootDir, + ) + }, }) - void run() - })() + return + } + dispatchHeadlessCronCommand({ + basePrompt: task.prompt, + sourceId: task.id, + sourceLabel: task.prompt, + logSuffix: ` ${task.id}`, + onSuccess: enqueueAndRun, + }) }, isLoading: () => running 
|| inputClosed, getJitterConfig: cronJitterConfigModule?.getCronJitterConfig, diff --git a/src/commands/skill-learning/index.ts b/src/commands/skill-learning/index.ts index a5afb655d1..6fff9c5276 100644 --- a/src/commands/skill-learning/index.ts +++ b/src/commands/skill-learning/index.ts @@ -1,5 +1,5 @@ import type { Command } from '../../commands.js' -import { isSkillLearningEnabled } from '../../services/skillLearning/featureCheck.js' +import { isSkillLearningCompiledIn } from '../../services/skillLearning/featureCheck.js' const skillLearning = { type: 'local-jsx', @@ -7,7 +7,10 @@ const skillLearning = { description: 'Manage skill learning (observe, analyze, evolve)', argumentHint: '[start|stop|about|status|ingest|evolve|export|import|prune|promote|projects]', - isEnabled: () => isSkillLearningEnabled(), + // The slash command is visible whenever the subsystem is compiled in. + // Whether the runtime feature is actually doing work is a separate + // concern controlled by `/skill-learning start` (see featureCheck.ts). + isEnabled: () => isSkillLearningCompiledIn(), isHidden: false, load: () => import('./skillPanel.js'), } satisfies Command diff --git a/src/commands/skill-search/index.ts b/src/commands/skill-search/index.ts index e3c35aea07..814a6af137 100644 --- a/src/commands/skill-search/index.ts +++ b/src/commands/skill-search/index.ts @@ -1,10 +1,14 @@ import type { Command } from '../../commands.js' +import { isSkillSearchCompiledIn } from '../../services/skillSearch/featureCheck.js' const skillSearch = { type: 'local-jsx', name: 'skill-search', description: 'Control automatic skill matching during conversations', argumentHint: '[start|stop|about|status]', + // Visible whenever the subsystem is compiled in (build flag); runtime + // activation is separate and operator-controlled via /skill-search start. 
+ isEnabled: () => isSkillSearchCompiledIn(), isHidden: false, load: () => import('./skillSearchPanel.js'), } satisfies Command diff --git a/src/daemon/main.ts b/src/daemon/main.ts index 513103e9ae..0d3855ddb2 100644 --- a/src/daemon/main.ts +++ b/src/daemon/main.ts @@ -30,6 +30,7 @@ interface WorkerState { failureCount: number parked: boolean lastStartTime: number + restartTimer: ReturnType | null } /** @@ -241,6 +242,7 @@ async function runSupervisor(args: string[]): Promise { failureCount: 0, parked: false, lastStartTime: 0, + restartTimer: null, }, ] @@ -261,6 +263,10 @@ async function runSupervisor(args: string[]): Promise { controller.abort() removeDaemonState() for (const w of workers) { + if (w.restartTimer) { + clearTimeout(w.restartTimer) + w.restartTimer = null + } if (w.process && !w.process.killed) { w.process.kill('SIGTERM') } @@ -288,22 +294,30 @@ async function runSupervisor(args: string[]): Promise { // Wait for all workers to exit await Promise.all( workers - .filter(w => w.process && !w.process.killed) + .filter(w => w.process && w.process.exitCode === null) .map( w => new Promise(resolve => { - if (!w.process) { + if (!w.process || w.process.exitCode !== null) { resolve() return } - w.process.on('exit', () => resolve()) + let killTimer: ReturnType | null = null + w.process.on('exit', () => { + if (killTimer) { + clearTimeout(killTimer) + killTimer = null + } + resolve() + }) // Force kill after grace period - setTimeout(() => { - if (w.process && !w.process.killed) { + killTimer = setTimeout(() => { + if (w.process && w.process.exitCode === null) { w.process.kill('SIGKILL') } resolve() }, 30_000) + killTimer.unref?.() }), ), ) @@ -398,11 +412,13 @@ function spawnWorker( `[daemon] worker '${worker.kind}' exited (code=${code}, signal=${sig}), restarting in ${worker.backoffMs}ms`, ) - setTimeout(() => { + worker.restartTimer = setTimeout(() => { + worker.restartTimer = null if (!signal.aborted && !worker.parked) { spawnWorker(worker, dir, config, 
signal) } }, worker.backoffMs) + worker.restartTimer.unref?.() // Exponential backoff worker.backoffMs = Math.min( diff --git a/src/entrypoints/cli.tsx b/src/entrypoints/cli.tsx index 844c4d7109..a535f35683 100644 --- a/src/entrypoints/cli.tsx +++ b/src/entrypoints/cli.tsx @@ -255,6 +255,29 @@ async function main(): Promise { return } + // Fast-path for `claude autonomy ...`: state inspection/management commands + // do not need the full interactive CLI bootstrap. The full Commander path + // imports main.tsx and runs root preAction initialization before the autonomy + // action; under coverage/CI that leaves unrelated handles around simple + // state-only subprocess calls. + if (args[0] === 'autonomy') { + profileCheckpoint('cli_autonomy_path') + const { getAutonomyCommandText } = await import( + '../cli/handlers/autonomy.js' + ) + const text = await getAutonomyCommandText(args.slice(1).join(' ')) + await new Promise((resolve, reject) => { + process.stdout.write(`${text}\n`, error => { + if (error) { + reject(error) + return + } + resolve() + }) + }) + process.exit(0) + } + // Fast-path for `--bg`/`--background` shortcut → daemon bg. 
if ( feature('BG_SESSIONS') && @@ -398,4 +421,4 @@ async function main(): Promise { } // eslint-disable-next-line custom-rules/no-top-level-side-effects -void main() +await main() diff --git a/src/hooks/__tests__/useScheduledTasks.test.ts b/src/hooks/__tests__/useScheduledTasks.test.ts new file mode 100644 index 0000000000..ce6b1f966a --- /dev/null +++ b/src/hooks/__tests__/useScheduledTasks.test.ts @@ -0,0 +1,80 @@ +import { afterEach, beforeEach, describe, expect, test } from 'bun:test' +import { + resetStateForTests, + setCwdState, + setOriginalCwd, + setProjectRoot, +} from '../../bootstrap/state' +import { createScheduledTaskQueuedCommand } from '../useScheduledTasks' +import { + listAutonomyRuns, + markAutonomyRunCompleted, +} from '../../utils/autonomyRuns' +import { resetAutonomyAuthorityForTests } from '../../utils/autonomyAuthority' +import { cleanupTempDir, createTempDir } from '../../../tests/mocks/file-system' + +let tempDir = '' + +beforeEach(async () => { + tempDir = await createTempDir('scheduled-tasks-') + resetStateForTests() + resetAutonomyAuthorityForTests() + setOriginalCwd(tempDir) + setProjectRoot(tempDir) + setCwdState(tempDir) +}) + +afterEach(async () => { + resetStateForTests() + resetAutonomyAuthorityForTests() + if (tempDir) { + await cleanupTempDir(tempDir) + } +}) + +describe('createScheduledTaskQueuedCommand', () => { + function createCommandForTest(task: { id: string; prompt: string }) { + return createScheduledTaskQueuedCommand(task, { + rootDir: tempDir, + currentDir: tempDir, + }) + } + + test('skips a scheduled task when the same source already has an active run', async () => { + const task = { + id: 'cron-1', + prompt: '/loop review the repository', + } + + const first = await createCommandForTest(task) + const second = await createCommandForTest(task) + const runs = await listAutonomyRuns(tempDir) + + expect(first).not.toBeNull() + expect(second).toBeNull() + expect(runs).toHaveLength(1) + expect(runs[0]).toMatchObject({ + 
trigger: 'scheduled-task', + status: 'queued', + sourceId: 'cron-1', + }) + }) + + test('allows a scheduled task after the previous same-source run completes', async () => { + const task = { + id: 'cron-1', + prompt: '/loop review the repository', + } + + const first = await createCommandForTest(task) + expect(first?.autonomy?.runId).toBeDefined() + + await markAutonomyRunCompleted(first!.autonomy!.runId, tempDir, 100) + const second = await createCommandForTest(task) + const runs = await listAutonomyRuns(tempDir) + + expect(second).not.toBeNull() + expect(runs).toHaveLength(2) + expect(runs.map(run => run.status).sort()).toEqual(['completed', 'queued']) + }) +}) diff --git a/src/hooks/useReplBridge.tsx b/src/hooks/useReplBridge.tsx index df9669e2e0..fb05c1c948 100644 --- a/src/hooks/useReplBridge.tsx +++ b/src/hooks/useReplBridge.tsx @@ -189,12 +189,6 @@ export function useReplBridge( } let cancelled = false - // Map of pending bridge permission response handlers, keyed by request_id. - // Defined at useEffect scope so the cleanup function can clear it on unmount. - const pendingPermissionHandlers = new Map< - string, - (response: BridgePermissionResponse) => void - >() // Capture messages.length now so we don't re-send initial messages // through writeMessages after the bridge connects. const initialMessageCount = messages.length @@ -467,6 +461,13 @@ export function useReplBridge( } } + // Map of pending bridge permission response handlers, keyed by request_id. + // Each entry is an onResponse handler waiting for CCR to reply. 
+ const pendingPermissionHandlers = new Map< + string, + (response: BridgePermissionResponse) => void + >() + // Dispatch incoming control_response messages to registered handlers function handlePermissionResponse(msg: SDKControlResponse): void { const requestId = msg.response?.request_id @@ -817,10 +818,6 @@ export function useReplBridge( return () => { cancelled = true - // Release all pending permission handlers so their closures (which - // may capture React state/setters) can be GC'd immediately rather - // than waiting for the entire useEffect closure to become unreachable. - pendingPermissionHandlers.clear() clearTimeout(failureTimeoutRef.current) failureTimeoutRef.current = undefined if (handleRef.current) { diff --git a/src/hooks/useScheduledTasks.ts b/src/hooks/useScheduledTasks.ts index e1dadbe851..5013270b2b 100644 --- a/src/hooks/useScheduledTasks.ts +++ b/src/hooks/useScheduledTasks.ts @@ -10,13 +10,18 @@ import type { Message } from '../types/message.js' import { getCwd } from '../utils/cwd.js' import { getCronJitterConfig } from '../utils/cronJitterConfig.js' import { createCronScheduler } from '../utils/cronScheduler.js' -import { removeCronTasks } from '../utils/cronTasks.js' -import { createAutonomyQueuedPrompt } from '../utils/autonomyRuns.js' -import { markAutonomyRunFailed } from '../utils/autonomyRuns.js' +import { removeCronTasks, type CronTask } from '../utils/cronTasks.js' +import { + createAutonomyQueuedPrompt, + createAutonomyQueuedPromptIfNoActiveSource, + markAutonomyRunCancelled, + markAutonomyRunFailed, +} from '../utils/autonomyRuns.js' import { logForDebugging } from '../utils/debug.js' import { enqueuePendingNotification } from '../utils/messageQueueManager.js' import { createScheduledTaskFireMessage } from '../utils/messages.js' import { WORKLOAD_CRON } from '../utils/workloadContext.js' +import type { QueuedCommand } from '../types/textInputTypes.js' type Props = { isLoading: boolean @@ -32,6 +37,32 @@ type Props = { 
setMessages: React.Dispatch> } +export async function createScheduledTaskQueuedCommand( + task: Pick, + options?: { + rootDir?: string + currentDir?: string + shouldCreate?: () => boolean + }, +): Promise { + const command = await createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: task.prompt, + trigger: 'scheduled-task', + rootDir: options?.rootDir, + currentDir: options?.currentDir ?? getCwd(), + sourceId: task.id, + sourceLabel: task.prompt, + workload: WORKLOAD_CRON, + shouldCreate: options?.shouldCreate, + }) + if (!command) { + logForDebugging( + `[ScheduledTasks] skipping ${task.id}: previous run still queued or running`, + ) + } + return command +} + /** * REPL wrapper for the cron scheduler. Mounts the scheduler once and tears * it down on unmount. Fired prompts go into the command queue as 'later' @@ -71,16 +102,25 @@ export function useScheduledTasks({ // forward isMeta, so their messages remain visible in the // transcript. This is acceptable since normal mode is not the // primary use case for scheduled tasks. + let disposed = false const enqueueForLead = async (prompt: string) => { const command = await createAutonomyQueuedPrompt({ basePrompt: prompt, trigger: 'scheduled-task', currentDir: getCwd(), workload: WORKLOAD_CRON, + shouldCreate: () => !disposed, }) if (!command) { return } + if (disposed) { + await markAutonomyRunCancelled( + command.autonomy!.runId, + command.autonomy!.rootDir, + ) + return + } enqueuePendingNotification(command) } @@ -90,7 +130,12 @@ export function useScheduledTasks({ // which is populated from disk at scheduler startup — this path only // handles team-lead durable crons. onFire: prompt => { - void enqueueForLead(prompt) + void enqueueForLead(prompt).catch(error => + logForDebugging( + `[ScheduledTasks] failed to enqueue missed task prompt: ${error}`, + { level: 'error' }, + ), + ) }, // Normal fires receive the full CronTask so we can route by agentId. 
onFireTask: task => { @@ -101,22 +146,26 @@ export function useScheduledTasks({ store.getState().tasks, ) if (teammate && !isTerminalTaskStatus(teammate.status)) { - const command = await createAutonomyQueuedPrompt({ - basePrompt: task.prompt, - trigger: 'scheduled-task', - currentDir: getCwd(), - sourceId: task.id, - sourceLabel: task.prompt, - workload: WORKLOAD_CRON, - }) + const command = await createScheduledTaskQueuedCommand( + task, + { shouldCreate: () => !disposed }, + ) if (!command) { return } + if (disposed) { + await markAutonomyRunCancelled( + command.autonomy!.runId, + command.autonomy!.rootDir, + ) + return + } const injected = injectUserMessageToTeammate( teammate.id, command.value as string, { autonomyRunId: command.autonomy?.runId, + autonomyRootDir: command.autonomy?.rootDir, origin: command.origin, }, setAppState, @@ -125,6 +174,7 @@ export function useScheduledTasks({ await markAutonomyRunFailed( command.autonomy.runId, `Teammate ${task.agentId} exited before the scheduled message could be delivered.`, + command.autonomy.rootDir, ) } return @@ -139,24 +189,32 @@ export function useScheduledTasks({ return } - const command = await createAutonomyQueuedPrompt({ - basePrompt: task.prompt, - trigger: 'scheduled-task', - currentDir: getCwd(), - sourceId: task.id, - sourceLabel: task.prompt, - workload: WORKLOAD_CRON, - }) + const command = await createScheduledTaskQueuedCommand( + task, + { shouldCreate: () => !disposed }, + ) if (!command) { return } + if (disposed) { + await markAutonomyRunCancelled( + command.autonomy!.runId, + command.autonomy!.rootDir, + ) + return + } const msg = createScheduledTaskFireMessage( `Running scheduled task (${formatCronFireTime(new Date())})`, ) setMessages(prev => [...prev, msg]) enqueuePendingNotification(command) - })() + })().catch(error => + logForDebugging( + `[ScheduledTasks] failed to enqueue task ${task.id}: ${error}`, + { level: 'error' }, + ), + ) }, isLoading: () => isLoadingRef.current, assistantMode, 
@@ -164,7 +222,10 @@ export function useScheduledTasks({ isKilled: () => !isKairosCronEnabled(), }) scheduler.start() - return () => scheduler.stop() + return () => { + disposed = true + scheduler.stop() + } // assistantMode is stable for the session lifetime; store/setAppState are // stable refs from useSyncExternalStore; setMessages is a stable useCallback. // eslint-disable-next-line react-hooks/exhaustive-deps diff --git a/src/proactive/useProactive.ts b/src/proactive/useProactive.ts index aa79ef7584..2853725fa5 100644 --- a/src/proactive/useProactive.ts +++ b/src/proactive/useProactive.ts @@ -9,7 +9,9 @@ import { useEffect, useRef } from 'react' import type { QueuedCommand } from '../types/textInputTypes.js' import { TICK_TAG } from '../constants/xml.js' import { getCwd } from '../utils/cwd.js' +import { cancelQueuedAutonomyCommands } from '../utils/autonomyQueueLifecycle.js' import { createProactiveAutonomyCommands } from '../utils/autonomyRuns.js' +import { logForDebugging } from '../utils/debug.js' import { isProactiveActive, isProactivePaused, @@ -38,6 +40,8 @@ export function useProactive(opts: UseProactiveOpts): void { if (!isProactiveActive()) return let timer: ReturnType | null = null + let disposed = false + let generating = false function scheduleTick(): void { const nextTs = Date.now() + TICK_INTERVAL_MS @@ -66,25 +70,51 @@ export function useProactive(opts: UseProactiveOpts): void { isLoading || isInPlanMode || hasActiveLocalJsxUI || - queuedCommandsLength > 0 + queuedCommandsLength > 0 || + generating ) { scheduleTick() return } + generating = true void (async () => { const commands = await createProactiveAutonomyCommands({ basePrompt: `<${TICK_TAG}>${new Date().toLocaleTimeString()}`, currentDir: getCwd(), + shouldCreate: () => !disposed, }) - for (const command of commands) { - // Always queue proactive turns. 
This avoids races where the prompt - // is built asynchronously, a user turn starts meanwhile, and a - // direct-submit path would silently drop the autonomy turn after - // consuming its heartbeat due-state. - optsRef.current.onQueueTick(command) + if (disposed) { + await cancelQueuedAutonomyCommands({ commands }) + return + } + const queuedCommands: QueuedCommand[] = [] + try { + for (const command of commands) { + // Always queue proactive turns. This avoids races where the prompt + // is built asynchronously, a user turn starts meanwhile, and a + // direct-submit path would silently drop the autonomy turn after + // consuming its heartbeat due-state. + optsRef.current.onQueueTick(command) + queuedCommands.push(command) + } + } catch (error) { + await cancelQueuedAutonomyCommands({ + commands: commands.filter( + command => !queuedCommands.includes(command), + ), + }) + throw error } })() + .catch(error => + logForDebugging(`[Proactive] failed to create tick: ${error}`, { + level: 'error', + }), + ) + .finally(() => { + generating = false + }) // Schedule next tick scheduleTick() @@ -94,6 +124,7 @@ export function useProactive(opts: UseProactiveOpts): void { scheduleTick() return () => { + disposed = true if (timer !== null) { clearTimeout(timer) timer = null diff --git a/src/query.ts b/src/query.ts index fc7830727a..b7bce909c1 100644 --- a/src/query.ts +++ b/src/query.ts @@ -71,10 +71,16 @@ const jobClassifier = feature('TEMPLATES') : null /* eslint-enable @typescript-eslint/no-require-imports */ import { + enqueue, remove as removeFromQueue, getCommandsByMaxPriority, isSlashCommand, } from './utils/messageQueueManager.js' +import { + type AutonomyTurnOutcome, + claimConsumableQueuedAutonomyCommands, + finalizeAutonomyCommandsForTurn, +} from './utils/autonomyQueueLifecycle.js' import { notifyCommandLifecycle } from './utils/commandLifecycle.js' import { headlessProfilerCheckpoint } from './utils/headlessProfiler.js' import { @@ -92,6 +98,7 @@ import { 
SLEEP_TOOL_NAME } from '@claude-code-best/builtin-tools/tools/SleepTool import { executePostSamplingHooks } from './utils/hooks/postSamplingHooks.js' import { executeStopFailureHooks } from './utils/hooks.js' import type { QuerySource } from './constants/querySource.js' +import type { QueuedCommand } from './types/textInputTypes.js' import { createDumpPromptsFetch } from './services/api/dumpPrompts.js' import { StreamingToolExecutor } from './services/tools/StreamingToolExecutor.js' import { queryCheckpoint } from './utils/queryProfiler.js' @@ -111,7 +118,11 @@ import { } from './bootstrap/state.js' import { createBudgetTracker, checkTokenBudget } from './query/tokenBudget.js' import { count } from './utils/array.js' -import { createTrace, endTrace, isLangfuseEnabled } from './services/langfuse/index.js' +import { + createTrace, + endTrace, + isLangfuseEnabled, +} from './services/langfuse/index.js' import { getAPIProvider } from './utils/model/providers.js' /* eslint-disable @typescript-eslint/no-require-imports */ @@ -129,7 +140,11 @@ function* yieldMissingToolResultBlocks( ) { for (const assistantMessage of assistantMessages) { // Extract all tool use blocks from this assistant message - const toolUseBlocks = (Array.isArray(assistantMessage.message?.content) ? assistantMessage.message.content : []).filter( + const toolUseBlocks = ( + Array.isArray(assistantMessage.message?.content) + ? 
assistantMessage.message.content + : [] + ).filter( (content: { type: string }) => content.type === 'tool_use', ) as ToolUseBlock[] @@ -181,6 +196,33 @@ function isWithheldMaxOutputTokens( return msg?.type === 'assistant' && msg.apiError === 'max_output_tokens' } +function getAutonomyTurnOutcome(params: { + terminal?: Terminal + thrownError?: unknown +}): AutonomyTurnOutcome { + if (params.thrownError !== undefined) { + return { type: 'failed', error: params.thrownError } + } + + const terminal = params.terminal + const reason = terminal?.reason + switch (reason) { + case 'completed': + return { type: 'completed' } + case undefined: + case 'aborted_streaming': + case 'aborted_tools': + return { type: 'cancelled' } + case 'model_error': + return { type: 'failed', error: terminal.error } + default: + return { + type: 'failed', + message: `query ended without successful completion: ${reason}`, + } + } +} + export type QueryParams = { messages: Message[] systemPrompt: SystemPrompt @@ -230,6 +272,7 @@ export async function* query( Terminal > { const consumedCommandUuids: string[] = [] + const consumedAutonomyCommands: QueuedCommand[] = [] // Create Langfuse trace for this query turn (no-op if not configured). // When called as a sub-agent, langfuseTrace is already set by runAgent() @@ -238,8 +281,9 @@ export async function* query( logForDebugging( `[query] ownsTrace=${ownsTrace} incoming langfuseTrace=${params.toolUseContext.langfuseTrace ? 'present' : 'null/undefined'} isLangfuseEnabled=${isLangfuseEnabled()}`, ) - const langfuseTrace = params.toolUseContext.langfuseTrace - ?? (isLangfuseEnabled() + const langfuseTrace = + params.toolUseContext.langfuseTrace ?? + (isLangfuseEnabled() ? 
createTrace({ sessionId: getSessionId(), model: params.toolUseContext.options.mainLoopModel, @@ -258,9 +302,34 @@ export async function* query( : params let terminal: Terminal | undefined + let didThrow = false + let thrownError: unknown try { - terminal = yield* queryLoop(paramsWithTrace, consumedCommandUuids) + terminal = yield* queryLoop( + paramsWithTrace, + consumedCommandUuids, + consumedAutonomyCommands, + ) + } catch (error) { + didThrow = true + thrownError = error + throw error } finally { + await finalizeAutonomyCommandsForTurn({ + commands: consumedAutonomyCommands, + outcome: getAutonomyTurnOutcome({ + terminal, + ...(didThrow ? { thrownError } : {}), + }), + priority: 'later', + }) + .then(nextCommands => { + for (const command of nextCommands) { + enqueue(command) + } + }) + .catch(logError) + // Only end the trace if we created it — sub-agents own their traces if (ownsTrace) { const isAborted = @@ -283,6 +352,7 @@ export async function* query( async function* queryLoop( params: QueryParams, consumedCommandUuids: string[], + consumedAutonomyCommands: QueuedCommand[], ): AsyncGenerator< | StreamEvent | RequestStartEvent @@ -790,7 +860,14 @@ async function* queryLoop( let yieldMessage: typeof message = message if (message.type === 'assistant') { const assistantMsg = message as AssistantMessage - const contentArr = Array.isArray(assistantMsg.message?.content) ? assistantMsg.message.content as unknown as Array<{ type: string; input?: unknown; name?: string; [key: string]: unknown }> : [] + const contentArr = Array.isArray(assistantMsg.message?.content) + ? (assistantMsg.message.content as unknown as Array<{ + type: string + input?: unknown + name?: string + [key: string]: unknown + }>) + : [] let clonedContent: typeof contentArr | undefined for (let i = 0; i < contentArr.length; i++) { const block = contentArr[i]! @@ -826,7 +903,10 @@ async function* queryLoop( if (clonedContent) { yieldMessage = { ...message, - message: { ...(assistantMsg.message ?? 
{}), content: clonedContent }, + message: { + ...(assistantMsg.message ?? {}), + content: clonedContent, + }, } as typeof message } } @@ -872,7 +952,11 @@ async function* queryLoop( const assistantMessage = message as AssistantMessage assistantMessages.push(assistantMessage) - const msgToolUseBlocks = (Array.isArray(assistantMessage.message?.content) ? assistantMessage.message.content : []).filter( + const msgToolUseBlocks = ( + Array.isArray(assistantMessage.message?.content) + ? assistantMessage.message.content + : [] + ).filter( (content: { type: string }) => content.type === 'tool_use', ) as ToolUseBlock[] if (msgToolUseBlocks.length > 0) { @@ -1005,7 +1089,10 @@ async function* queryLoop( logEvent('tengu_query_error', { assistantMessages: assistantMessages.length, toolUses: assistantMessages.flatMap(_ => - (Array.isArray(_.message?.content) ? _.message.content as Array<{ type: string }> : []).filter(content => content.type === 'tool_use'), + (Array.isArray(_.message?.content) + ? (_.message.content as Array<{ type: string }>) + : [] + ).filter(content => content.type === 'tool_use'), ).length, queryChainId: queryChainIdForAnalytics, @@ -1307,7 +1394,10 @@ async function* queryLoop( // error → hook blocking → retry → error → … if (lastMessage?.isApiErrorMessage) { void executeStopFailureHooks(lastMessage, toolUseContext) - return { reason: 'completed' } + return { + reason: 'model_error', + error: lastMessage.error ?? lastMessage.apiError ?? 'api_error', + } } const stopHookResult = yield* handleStopHooks( @@ -1408,7 +1498,6 @@ async function* queryLoop( queryCheckpoint('query_tool_execution_start') - if (streamingToolExecutor) { logEvent('tengu_streaming_tool_execution_used', { tool_count: toolUseBlocks.length, @@ -1468,9 +1557,14 @@ async function* queryLoop( const lastAssistantMessage = assistantMessages.at(-1) let lastAssistantText: string | undefined if (lastAssistantMessage) { - const textBlocks = (Array.isArray(lastAssistantMessage.message?.content) ? 
lastAssistantMessage.message.content as Array<{ type: string; text?: string }> : []).filter( - block => block.type === 'text', - ) + const textBlocks = ( + Array.isArray(lastAssistantMessage.message?.content) + ? (lastAssistantMessage.message.content as Array<{ + type: string + text?: string + }>) + : [] + ).filter(block => block.type === 'text') if (textBlocks.length > 0) { const lastTextBlock = textBlocks.at(-1) if (lastTextBlock && 'text' in lastTextBlock) { @@ -1622,12 +1716,32 @@ async function* queryLoop( // user prompts, even if someone stamps an agentId on one. return cmd.mode === 'task-notification' && cmd.agentId === currentAgentId }) + const queuedAutonomyClaim = await claimConsumableQueuedAutonomyCommands( + queuedCommandsSnapshot, + ) + if (queuedAutonomyClaim.staleCommands.length > 0) { + removeFromQueue(queuedAutonomyClaim.staleCommands) + } + + const claimedConsumedCommands = queuedAutonomyClaim.claimedCommands.filter( + cmd => cmd.mode === 'prompt' || cmd.mode === 'task-notification', + ) + if (claimedConsumedCommands.length > 0) { + consumedAutonomyCommands.push(...claimedConsumedCommands) + for (const cmd of claimedConsumedCommands) { + if (cmd.uuid) { + consumedCommandUuids.push(cmd.uuid) + notifyCommandLifecycle(cmd.uuid, 'started') + } + } + removeFromQueue(claimedConsumedCommands) + } for await (const attachment of getAttachmentMessages( null, updatedToolUseContext, null, - queuedCommandsSnapshot, + queuedAutonomyClaim.attachmentCommands, [...messagesForQuery, ...assistantMessages, ...toolResults], querySource, )) { @@ -1659,7 +1773,6 @@ async function* queryLoop( pendingMemoryPrefetch.consumedOnIteration = turnCount - 1 } - // Inject prefetched skill discovery. collectSkillDiscoveryPrefetch emits // hidden_by_main_turn — true when the prefetch resolved before this point // (should be >98% at AKI@250ms / Haiku@573ms vs turn durations of 2-30s). 
@@ -1675,8 +1788,11 @@ async function* queryLoop( // Remove only commands that were actually consumed as attachments. // Prompt and task-notification commands are converted to attachments above. - const consumedCommands = queuedCommandsSnapshot.filter( - cmd => cmd.mode === 'prompt' || cmd.mode === 'task-notification', + const claimedCommandSet = new Set(claimedConsumedCommands) + const consumedCommands = queuedAutonomyClaim.attachmentCommands.filter( + cmd => + (cmd.mode === 'prompt' || cmd.mode === 'task-notification') && + !claimedCommandSet.has(cmd), ) if (consumedCommands.length > 0) { for (const cmd of consumedCommands) { diff --git a/src/query/transitions.ts b/src/query/transitions.ts index f8fe515514..ba2fa8b401 100644 --- a/src/query/transitions.ts +++ b/src/query/transitions.ts @@ -1,3 +1,20 @@ -// Auto-generated stub — replace with real implementation -export type Terminal = any; -export type Continue = any; +export type Terminal = + | { reason: 'completed' } + | { reason: 'blocking_limit' } + | { reason: 'image_error' } + | { reason: 'model_error'; error?: unknown } + | { reason: 'aborted_streaming' } + | { reason: 'aborted_tools' } + | { reason: 'prompt_too_long' } + | { reason: 'stop_hook_prevented' } + | { reason: 'hook_stopped' } + | { reason: 'max_turns'; turnCount: number } + +export type Continue = + | { reason: 'collapse_drain_retry'; committed: number } + | { reason: 'reactive_compact_retry' } + | { reason: 'max_output_tokens_escalate' } + | { reason: 'max_output_tokens_recovery'; attempt: number } + | { reason: 'stop_hook_blocking' } + | { reason: 'token_budget_continuation' } + | { reason: 'next_turn' } diff --git a/src/screens/REPL.tsx b/src/screens/REPL.tsx index 28e4132d84..fe26e38cfb 100644 --- a/src/screens/REPL.tsx +++ b/src/screens/REPL.tsx @@ -79,10 +79,9 @@ import { isEnvTruthy } from '../utils/envUtils.js'; import { formatTokens, truncateToWidth } from '../utils/format.js'; import { consumeEarlyInput } from '../utils/earlyInput.js'; 
import { - finalizeAutonomyRunCompleted, - finalizeAutonomyRunFailed, - markAutonomyRunRunning, -} from '../utils/autonomyRuns.js'; + claimConsumableQueuedAutonomyCommands, + finalizeAutonomyCommandsForTurn, +} from '../utils/autonomyQueueLifecycle.js'; import { setMemberActive } from '../utils/swarm/teamHelpers.js'; import { @@ -3054,18 +3053,19 @@ export function REPL({ setMessages(old => { const postBoundary = getMessagesAfterCompactBoundary(old, { includeSnipped: true, - }) + }); // Hard cap: keep at most 500 messages in fullscreen scrollback // to prevent unbounded memory growth in multi-day sessions. // normalizeMessages/applyGrouping are O(n), and Ink fiber // trees cost ~250KB RSS per message. Without this cap, // scrollback after several compactions can reach thousands // of messages (observed: 13k+, 1GB+ heap). - const MAX_FULLSCREEN_SCROLLBACK = 500 - const kept = postBoundary.length > MAX_FULLSCREEN_SCROLLBACK - ? postBoundary.slice(-MAX_FULLSCREEN_SCROLLBACK) - : postBoundary - return [...kept, newMessage] + const MAX_FULLSCREEN_SCROLLBACK = 500; + const kept = + postBoundary.length > MAX_FULLSCREEN_SCROLLBACK + ? postBoundary.slice(-MAX_FULLSCREEN_SCROLLBACK) + : postBoundary; + return [...kept, newMessage]; }); } else { setMessages(() => [newMessage]); @@ -3098,13 +3098,10 @@ export function REPL({ // so interleaved non-ephemeral messages caused duplicate progress // entries to accumulate (observed 13k+ entries in sleep-heavy sessions). for (let i = oldMessages.length - 1; i >= 0; i--) { - const m = oldMessages[i]! 
- if (m.type !== 'progress') break - const mData = m.data as Record | undefined - if ( - m.parentToolUseID === newMessage.parentToolUseID && - mData?.type === newData.type - ) { + const m = oldMessages[i]!; + if (m.type !== 'progress') break; + const mData = m.data as Record | undefined; + if (m.parentToolUseID === newMessage.parentToolUseID && mData?.type === newData.type) { const copy = oldMessages.slice(); copy[i] = newMessage; return copy; @@ -3477,7 +3474,7 @@ export function REPL({ onBeforeQueryCallback?: (input: string, newMessages: MessageType[]) => Promise, input?: string, effort?: EffortValue, - ): Promise => { + ): Promise => { // If this is a teammate, mark them as active when starting a turn if (isAgentSwarmsEnabled()) { const teamName = getTeamName(); @@ -3508,7 +3505,7 @@ export function REPL({ logEvent('tengu_concurrent_onquery_enqueued', {}); } }); - return; + return false; } try { @@ -3541,7 +3538,7 @@ export function REPL({ if (onBeforeQueryCallback && input) { const shouldProceed = await onBeforeQueryCallback(input, latestMessages); if (!shouldProceed) { - return; + return true; } } @@ -3690,6 +3687,7 @@ export function REPL({ } } } + return true; }, [onQueryImpl, setAppState, resetLoadingState, queryGuard, mrOnBeforeQuery, mrOnTurnComplete], ); @@ -4844,44 +4842,62 @@ export function REPL({ } satisfies QueuedCommand) : input; - const newAbortController = createAbortController(); - setAbortController(newAbortController); + void (async () => { + const claim = await claimConsumableQueuedAutonomyCommands([queuedCommand]); + const command = claim.attachmentCommands[0]; + if (!command) return; - // Create a user message with the formatted content (includes XML wrapper) - const userMessage = createUserMessage({ - content: queuedCommand.value as string, - isMeta: queuedCommand.isMeta ? 
true : undefined, - origin: queuedCommand.origin, - }); + const newAbortController = createAbortController(); + setAbortController(newAbortController); - const autonomyRunId = queuedCommand.autonomy?.runId; - if (autonomyRunId) { - void markAutonomyRunRunning(autonomyRunId); - } + // Create a user message with the formatted content (includes XML wrapper) + const userMessage = createUserMessage({ + content: command.value, + isMeta: command.isMeta ? true : undefined, + origin: command.origin, + }); - void onQuery([userMessage], newAbortController, true, [], mainLoopModel) - .then(() => { - if (autonomyRunId) { - void finalizeAutonomyRunCompleted({ - runId: autonomyRunId, + let executed = false; + try { + executed = (await onQuery([userMessage], newAbortController, true, [], mainLoopModel)) !== false; + } catch (error: unknown) { + try { + await finalizeAutonomyCommandsForTurn({ + commands: claim.claimedCommands, + outcome: { type: 'failed', error }, currentDir: getCwd(), priority: 'later', - }).then(nextCommands => { - for (const command of nextCommands) { - enqueue(command); - } - }); - } - }) - .catch((error: unknown) => { - if (autonomyRunId) { - void finalizeAutonomyRunFailed({ - runId: autonomyRunId, - error: String(error), }); + } catch (finalizeError: unknown) { + logError(toError(finalizeError)); } logError(toError(error)); - }); + return; + } + + // Only finalize as completed when onQuery actually executed the turn + // (it returns false from the concurrent-guard path without running). + // Keep this finalize in its own try/catch so a failure here does not + // trigger a second finalize as `failed` for the same commands. 
+ if (!executed) { + return; + } + try { + const nextCommands = await finalizeAutonomyCommandsForTurn({ + commands: claim.claimedCommands, + outcome: { type: 'completed' }, + currentDir: getCwd(), + priority: 'later', + }); + for (const nextCommand of nextCommands) { + enqueue(nextCommand); + } + } catch (finalizeError: unknown) { + logError(toError(finalizeError)); + } + })().catch((error: unknown) => { + logError(toError(error)); + }); return true; }, [onQuery, mainLoopModel, store], diff --git a/src/services/compact/postCompactCleanup.ts b/src/services/compact/postCompactCleanup.ts index 50cbfd6172..b89e3a0be6 100644 --- a/src/services/compact/postCompactCleanup.ts +++ b/src/services/compact/postCompactCleanup.ts @@ -5,9 +5,9 @@ import { getUserContext } from '../../context.js' import { clearSpeculativeChecks } from '@claude-code-best/builtin-tools/tools/BashTool/bashPermissions.js' import { clearClassifierApprovals } from '../../utils/classifierApprovals.js' import { resetGetMemoryFilesCache } from '../../utils/claudemd.js' +import { logError } from '../../utils/log.js' import { clearSessionMessagesCache } from '../../utils/sessionStorage.js' import { clearBetaTracingState } from '../../utils/telemetry/betaSessionTracing.js' -import { getLspServerManager } from '../../services/lsp/manager.js' import { resetMicrocompactState } from './microCompact.js' /** @@ -29,7 +29,7 @@ import { resetMicrocompactState } from './microCompact.js' * pass querySource — undefined is only safe for callers that are * genuinely main-thread-only (/compact, /clear). */ -export async function runPostCompactCleanup(querySource?: QuerySource): Promise { +export function runPostCompactCleanup(querySource?: QuerySource): void { // Subagents (agent:*) run in the same process and share module-level // state with the main thread. Only reset main-thread module-level state // (context-collapse, memory file cache) for main-thread compacts. 
@@ -70,20 +70,22 @@ export async function runPostCompactCleanup(querySource?: QuerySource): Promise< // cacheUtils resets. See compactConversation() for full rationale. clearBetaTracingState() if (feature('COMMIT_ATTRIBUTION')) { - void import('../../utils/attributionHooks.js').then(m => - m.sweepFileContentCache(), - ) + // Intentionally fire-and-forget: the file-content cache sweep is a + // best-effort memory release whose completion no caller depends on. + // Keeping `runPostCompactCleanup` synchronous lets compaction call sites + // (REPL post-compact handler, /compact command, autoCompact) finish their + // own state transitions without an extra microtask round-trip — the sweep + // catches up on the next event-loop tick. + // + // The .catch is required even though the current attributionHooks.ts is a + // no-op stub: without it, a future restored sweepFileContentCache that + // throws would surface as an unhandled promise rejection from a function + // whose synchronous signature gives callers no way to observe it. + void import('../../utils/attributionHooks.js') + .then(m => m.sweepFileContentCache()) + .catch(error => { + logError(error) + }) } clearSessionMessagesCache() - // Close all LSP-tracked files so servers release state for files no longer - // in the active context after compaction. Best-effort — LSP may not be - // initialized, and closeAllFiles catches per-file errors internally. 
- try { - const lspManager = getLspServerManager() - if (lspManager) { - await lspManager.closeAllFiles() - } - } catch { - // LSP module may not be available in all environments - } } diff --git a/src/services/skillLearning/featureCheck.ts b/src/services/skillLearning/featureCheck.ts index f67f17919c..9a1488ee1d 100644 --- a/src/services/skillLearning/featureCheck.ts +++ b/src/services/skillLearning/featureCheck.ts @@ -1,12 +1,36 @@ import { feature } from 'bun:bundle' +/** + * Build-time presence check: is the `/skill-learning` slash command + * compiled into this build? Used by the command registry's `isEnabled` so + * the command appears in the menu whenever it is buildable. Operators + * activate the subsystem itself via `/skill-learning start`, which flips + * `SKILL_LEARNING_ENABLED=1` and turns the runtime observers on (see + * `isSkillLearningEnabled`). + */ +export function isSkillLearningCompiledIn(): boolean { + if (feature('SKILL_LEARNING')) return true + return false +} + +/** + * Runtime activation check: is the skill-learning subsystem actively + * running (toolEvent, runtime, session observers attached, persisting + * observations to disk)? Off by default — the operator must run + * `/skill-learning start` (which sets `SKILL_LEARNING_ENABLED=1`). + * + * Legacy `FEATURE_SKILL_LEARNING=1` is also accepted for backward + * compatibility with operators who set it before the slash-command UX + * landed. + * + * Build-flag gating is intentionally NOT performed here: the command + * registry already gates command compilation on the build flag, and this + * function is only reached from code paths that the build flag has + * already let through. Decoupling keeps the test surface clean (tests + * exercise the env-var contract without needing to mock `bun:bundle`). 
+ */ export function isSkillLearningEnabled(): boolean { - if (process.env.SKILL_LEARNING_ENABLED === '0') return false if (process.env.SKILL_LEARNING_ENABLED === '1') return true - if (process.env.FEATURE_SKILL_LEARNING === '0') return false if (process.env.FEATURE_SKILL_LEARNING === '1') return true - if (feature('SKILL_LEARNING')) { - return true - } return false } diff --git a/src/services/skillLearning/projectContext.ts b/src/services/skillLearning/projectContext.ts index a886cee6f5..3f50379735 100644 --- a/src/services/skillLearning/projectContext.ts +++ b/src/services/skillLearning/projectContext.ts @@ -45,15 +45,44 @@ export function getProjectContextPath(projectId: string): string { // in the tool.call hot path (one wrapper invocation per tool) that cost would // accumulate into the hundreds-of-ms range per session. Cache keyed by the // exact cwd string so different worktrees still get independent entries. +// +// Bounded with LRU eviction: long-lived processes that traverse many +// worktrees (e.g. multi-repo build orchestrators) would otherwise grow the +// cache without limit. Each entry holds a SkillLearningProjectContext +// (instinct + skill lists), so the cap ensures bounded memory regardless +// of cwd diversity. `defines.ts` originally flagged this as +// "无淘汰机制(非 GB 级主因)" ("no eviction mechanism; not the main cause of GB-scale growth") — this fix closes that gap.
+const PROJECT_CONTEXT_CACHE_MAX = 32 +const PROJECT_CONTEXT_CACHE_TRIM_TO = 24 const contextCache = new Map() const PERSIST_INTERVAL_MS = 5 * 60 * 1000 let lastPersistAt = 0 +function setProjectContextCache( + cwd: string, + ctx: SkillLearningProjectContext, +): void { + if (contextCache.has(cwd)) contextCache.delete(cwd) + contextCache.set(cwd, ctx) + if (contextCache.size > PROJECT_CONTEXT_CACHE_MAX) { + const toDrop = contextCache.size - PROJECT_CONTEXT_CACHE_TRIM_TO + const iter = contextCache.keys() + for (let i = 0; i < toDrop; i++) { + const next = iter.next() + if (next.done) break + contextCache.delete(next.value) + } + } +} + export function resolveProjectContext( cwd = process.cwd(), ): SkillLearningProjectContext { const cached = contextCache.get(cwd) if (cached) { + // Refresh insertion order so frequently-accessed cwds survive eviction. + contextCache.delete(cwd) + contextCache.set(cwd, cached) // Still touch the registry so long-lived processes keep `lastSeenAt` // reasonably fresh, but throttle the write so it doesn't fire on every // tool call. @@ -65,7 +94,7 @@ export function resolveProjectContext( return cached } const resolved = resolveContext(cwd) - contextCache.set(cwd, resolved) + setProjectContextCache(cwd, resolved) persistProjectContext(resolved) lastPersistAt = Date.now() return resolved diff --git a/src/services/skillLearning/promotion.ts b/src/services/skillLearning/promotion.ts index 12fb2805e6..7efc3c9bf4 100644 --- a/src/services/skillLearning/promotion.ts +++ b/src/services/skillLearning/promotion.ts @@ -23,8 +23,30 @@ export type PromotionOptions = { minConfidence?: number } +/** + * Set bounded with FIFO eviction. The number of promotions per session is + * small in practice (single digits), but a long-lived sandbox/daemon could + * push it far higher if it never restarts. The cap is defensive, and the degraded + * behaviour — the oldest ids are forgotten once the cap is exceeded and may + * be promoted again — is benign because promotion is idempotent at the lifecycle layer.
+ */ +const SESSION_PROMOTED_IDS_MAX = 256 +const SESSION_PROMOTED_IDS_TRIM_TO = 192 const sessionPromotedIds = new Set() +function recordSessionPromoted(id: string): void { + sessionPromotedIds.add(id) + if (sessionPromotedIds.size > SESSION_PROMOTED_IDS_MAX) { + const toDrop = sessionPromotedIds.size - SESSION_PROMOTED_IDS_TRIM_TO + const iter = sessionPromotedIds.values() + for (let i = 0; i < toDrop; i++) { + const next = iter.next() + if (next.done) break + sessionPromotedIds.delete(next.value) + } + } +} + export function resetPromotionBookkeeping(): void { sessionPromotedIds.clear() } @@ -103,7 +125,7 @@ export async function checkPromotion( } await saveInstinct(globalInstinct, globalOptions) - sessionPromotedIds.add(candidate.instinctId) + recordSessionPromoted(candidate.instinctId) promoted.push(candidate) } diff --git a/src/services/skillSearch/featureCheck.ts b/src/services/skillSearch/featureCheck.ts index 38dcda534e..dbdce72f3c 100644 --- a/src/services/skillSearch/featureCheck.ts +++ b/src/services/skillSearch/featureCheck.ts @@ -1,10 +1,30 @@ import { feature } from 'bun:bundle' -export function isSkillSearchEnabled(): boolean { - if (process.env.SKILL_SEARCH_ENABLED === '0') return false - if (process.env.SKILL_SEARCH_ENABLED === '1') return true - if (feature('EXPERIMENTAL_SKILL_SEARCH')) { - return true - } +/** + * Build-time presence check: is the `/skill-search` slash command compiled + * into this build? Used by the command registry's `isEnabled` so the + * command appears in the menu whenever it is buildable. Operators activate + * the subsystem itself via `/skill-search start`, which flips + * `SKILL_SEARCH_ENABLED=1` and turns the runtime hot paths on (see + * `isSkillSearchEnabled`). 
+ */ +export function isSkillSearchCompiledIn(): boolean { + if (feature('EXPERIMENTAL_SKILL_SEARCH')) return true return false } + +/** + * Runtime activation check: is the skill-search subsystem currently doing + * work (intentNormalize Haiku calls, prefetch hot path, telemetry)? Off by + * default — the operator must run `/skill-search start` (which sets + * `SKILL_SEARCH_ENABLED=1`). See docs/agent/sur-skill-overflow-bugs.md §5. + * + * Build-flag gating is intentionally NOT performed here: the command + * registry already gates command compilation on the build flag, and this + * function is only reached from code paths that the build flag has + * already let through. Decoupling keeps the test surface clean (tests + * exercise the env-var contract without needing to mock `bun:bundle`). + */ +export function isSkillSearchEnabled(): boolean { + return process.env.SKILL_SEARCH_ENABLED === '1' +} diff --git a/src/services/skillSearch/intentNormalize.ts b/src/services/skillSearch/intentNormalize.ts index 9073958b85..7ec5c226e3 100644 --- a/src/services/skillSearch/intentNormalize.ts +++ b/src/services/skillSearch/intentNormalize.ts @@ -47,10 +47,35 @@ Output ONLY keywords. Nothing else.` const DEFAULT_TIMEOUT_MS = 6_000 const MAX_QUERY_CHARS = 500 const MAX_KEYWORDS_CHARS = 120 +/** + * Bound on the process-level query→keywords cache. Insertion-order LRU — + * Map iteration order is insertion order, so we evict from the front when + * size exceeds the cap. ~200 entries × ~600 bytes (query + keywords) ≈ + * 120 KB worst case. Without this cap the cache grew monotonically with + * the diversity of Chinese queries in a long session. + */ +const CACHE_MAX_ENTRIES = 200 +const CACHE_TRIM_TO = 150 /** Process-level cache. Keyed by the original (trimmed) query. 
*/ const cache = new Map() +function setCachedQueryIntent(key: string, value: string): void { + // Refresh insertion order on hit-then-write so frequently-used keys + // stay alive (delete + set is the canonical Map-LRU idiom). + if (cache.has(key)) cache.delete(key) + cache.set(key, value) + if (cache.size > CACHE_MAX_ENTRIES) { + const toDrop = cache.size - CACHE_TRIM_TO + const iter = cache.keys() + for (let i = 0; i < toDrop; i++) { + const next = iter.next() + if (next.done) break + cache.delete(next.value) + } + } +} + export function isIntentNormalizeEnabled(): boolean { return process.env.SKILL_SEARCH_INTENT_ENABLED === '1' } @@ -74,12 +99,17 @@ export async function normalizeQueryIntent(query: string): Promise { if (!/[\u4e00-\u9fff]/.test(trimmed)) return trimmed const cached = cache.get(trimmed) - if (cached !== undefined) return cached + if (cached !== undefined) { + // Refresh LRU position so frequently-queried strings survive eviction. + cache.delete(trimmed) + cache.set(trimmed, cached) + return cached + } const capped = trimmed.slice(0, MAX_QUERY_CHARS) const keywords = await callHaiku(capped) const result = keywords ? `${trimmed} ${keywords}` : trimmed - cache.set(trimmed, result) + setCachedQueryIntent(trimmed, result) logForDebugging( `[skill-search] intent normalized: "${trimmed.slice(0, 40)}" -> "${keywords}"`, ) diff --git a/src/services/skillSearch/prefetch.ts b/src/services/skillSearch/prefetch.ts index 6d77f6c332..502524da18 100644 --- a/src/services/skillSearch/prefetch.ts +++ b/src/services/skillSearch/prefetch.ts @@ -14,9 +14,35 @@ import { readFile } from 'node:fs/promises' import { join } from 'node:path' import { parseFrontmatter } from '../../utils/frontmatterParser.js' +/** + * Per-session memoization to avoid re-emitting the same skill discovery / + * gap signal twice. 
Each Set is bounded to keep long-running sessions from + * monotonically accumulating skill names and signal keys forever (which + * was the original session-scoped-but-unbounded design). + * + * FIFO eviction by insertion order — once the cap is hit, the oldest + * entries roll off and may be re-recorded if rediscovered, which is the + * correct degraded behaviour: at worst we re-emit a duplicate signal, + * never silently drop a real one. + */ +const SESSION_TRACKING_MAX = 1000 +const SESSION_TRACKING_TRIM_TO = 750 const discoveredThisSession = new Set() const recordedGapSignals = new Set() +function addBoundedSessionEntry(set: Set, value: string): void { + set.add(value) + if (set.size > SESSION_TRACKING_MAX) { + const toDrop = set.size - SESSION_TRACKING_TRIM_TO + const iter = set.values() + for (let i = 0; i < toDrop; i++) { + const next = iter.next() + if (next.done) break + set.delete(next.value) + } + } +} + const AUTO_LOAD_MIN_SCORE = Number( process.env.SKILL_SEARCH_AUTOLOAD_MIN_SCORE ?? 
'0.30', ) @@ -185,7 +211,7 @@ async function maybeRecordSkillGap( const gapSignalKey = `${trigger}:${queryText.trim().toLowerCase()}` if (recordedGapSignals.has(gapSignalKey)) return undefined - recordedGapSignals.add(gapSignalKey) + addBoundedSessionEntry(recordedGapSignals, gapSignalKey) try { const [{ isSkillLearningEnabled }, { recordSkillGap }] = await Promise.all([ @@ -241,7 +267,7 @@ export async function startSkillDiscoveryPrefetch( const newResults = results.filter(r => !discoveredThisSession.has(r.name)) if (newResults.length === 0) return [] - for (const r of newResults) discoveredThisSession.add(r.name) + for (const r of newResults) addBoundedSessionEntry(discoveredThisSession, r.name) const signal: DiscoverySignal = { trigger: 'assistant_turn', @@ -305,7 +331,7 @@ export async function getTurnZeroSkillDiscovery( if (results.length === 0 && !gap) return null - for (const r of results) discoveredThisSession.add(r.name) + for (const r of results) addBoundedSessionEntry(discoveredThisSession, r.name) const signal: DiscoverySignal = { trigger: 'user_input', diff --git a/src/tasks/InProcessTeammateTask/InProcessTeammateTask.tsx b/src/tasks/InProcessTeammateTask/InProcessTeammateTask.tsx index 6b9d8c3cc1..52a202a368 100644 --- a/src/tasks/InProcessTeammateTask/InProcessTeammateTask.tsx +++ b/src/tasks/InProcessTeammateTask/InProcessTeammateTask.tsx @@ -73,6 +73,7 @@ export function injectUserMessageToTeammate( options: | { autonomyRunId?: string; + autonomyRootDir?: string; origin?: MessageOrigin; } | undefined, @@ -93,6 +94,9 @@ export function injectUserMessageToTeammate( if (options?.autonomyRunId !== undefined) { pendingMessage.autonomyRunId = options.autonomyRunId; } + if (options?.autonomyRootDir !== undefined) { + pendingMessage.autonomyRootDir = options.autonomyRootDir; + } if (options?.origin !== undefined) { pendingMessage.origin = options.origin; } diff --git a/src/tasks/InProcessTeammateTask/types.ts b/src/tasks/InProcessTeammateTask/types.ts 
index 90d9fb2120..17676647ba 100644 --- a/src/tasks/InProcessTeammateTask/types.ts +++ b/src/tasks/InProcessTeammateTask/types.ts @@ -22,6 +22,7 @@ export type TeammateIdentity = { export type PendingTeammateUserMessage = { message: string autonomyRunId?: string + autonomyRootDir?: string origin?: MessageOrigin } diff --git a/src/types/textInputTypes.ts b/src/types/textInputTypes.ts index 6b0a848d28..26e2c29ed3 100644 --- a/src/types/textInputTypes.ts +++ b/src/types/textInputTypes.ts @@ -361,6 +361,7 @@ export type QueuedCommand = { */ autonomy?: { runId: string + rootDir?: string trigger: 'scheduled-task' | 'proactive-tick' | 'managed-flow-step' sourceId?: string sourceLabel?: string diff --git a/src/utils/__tests__/autonomyAuthority.test.ts b/src/utils/__tests__/autonomyAuthority.test.ts index c9033134bf..a57b9a80dd 100644 --- a/src/utils/__tests__/autonomyAuthority.test.ts +++ b/src/utils/__tests__/autonomyAuthority.test.ts @@ -5,6 +5,7 @@ import { AUTONOMY_DIR, buildAutonomyTurnPrompt, loadAutonomyAuthority, + parseHeartbeatAuthorityTasks, resetAutonomyAuthorityForTests, } from '../autonomyAuthority' import { @@ -238,4 +239,79 @@ describe('autonomyAuthority', () => { expect(prompt).not.toContain('- weekly-report (7d): Ship the weekly report') expect(prompt).not.toContain('- gather (') }) + + test('parseHeartbeatAuthorityTasks ignores tasks: literals inside markdown code fences', () => { + const content = [ + '# HEARTBEAT.md', + '', + '```yaml', + 'tasks:', + ' - name: not-a-real-task', + ' interval: 1m', + ' prompt: "would-be-shadowed"', + '```', + '', + 'tasks:', + ' - name: real-task', + ' interval: 30m', + ' prompt: "Real prompt"', + ].join('\n') + + const parsed = parseHeartbeatAuthorityTasks(content) + + expect(parsed).toHaveLength(1) + expect(parsed[0]).toMatchObject({ + name: 'real-task', + interval: '30m', + prompt: 'Real prompt', + }) + }) + + test('parseHeartbeatAuthorityTasks ignores tasks: literals inside tilde markdown code fences', () => { + 
const content = [ + '# HEARTBEAT.md', + '', + '~~~yaml', + 'tasks:', + ' - name: not-a-real-task', + ' interval: 1m', + ' prompt: "would-be-shadowed"', + '~~~', + '', + 'tasks:', + ' - name: real-task', + ' interval: 30m', + ' prompt: "Real prompt"', + ].join('\n') + + const parsed = parseHeartbeatAuthorityTasks(content) + + expect(parsed).toHaveLength(1) + expect(parsed[0]).toMatchObject({ + name: 'real-task', + interval: '30m', + prompt: 'Real prompt', + }) + }) + + test('parseHeartbeatAuthorityTasks parses real tasks even when documentation precedes them', () => { + const content = [ + '# Heartbeat docs', + '', + 'See `tasks:` below — the parser keys on the literal at column 0.', + '', + 'tasks:', + ' - name: weekly', + ' interval: 7d', + ' prompt: "Ship report"', + ].join('\n') + + const parsed = parseHeartbeatAuthorityTasks(content) + + // Inline `tasks:` mention does NOT collide because it's not at column 0 + // on its own line — the existing line.trim() === 'tasks:' guard handles + // that case. This test pins the behaviour. 
+ expect(parsed).toHaveLength(1) + expect(parsed[0]?.name).toBe('weekly') + }) }) diff --git a/src/utils/__tests__/autonomyFlows.test.ts b/src/utils/__tests__/autonomyFlows.test.ts index 8436844b48..8cf504fb82 100644 --- a/src/utils/__tests__/autonomyFlows.test.ts +++ b/src/utils/__tests__/autonomyFlows.test.ts @@ -126,6 +126,14 @@ describe('listAutonomyFlows', () => { runCount: 0, ownerKey: DEFAULT_AUTONOMY_OWNER_KEY, currentDir: tempDir, + boundary: [ + ' src/utils/** ', + '/absolute/not-allowed', + 'src\\windows', + '../outside', + 'src/utils/**', + 'docs/*.md', + ], stateJson: { currentStepIndex: 0, steps: [ @@ -147,6 +155,7 @@ describe('listAutonomyFlows', () => { expect(flows).toHaveLength(1) expect(flows[0]?.flowId).toBe('flow-1') expect(flows[0]?.syncMode).toBe('managed') + expect(flows[0]?.boundary).toEqual(['src/utils/**', 'docs/*.md']) expect(flows[0]?.stateJson?.steps).toHaveLength(1) }) @@ -191,6 +200,64 @@ describe('listAutonomyFlows', () => { const flows = await listAutonomyFlows(tempDir) expect(flows).toEqual([]) }) + + test('persistence pruning keeps active flows ahead of recent terminal history', async () => { + const flows: AutonomyFlowRecord[] = [ + { + flowId: 'old-active', + flowKey: 'managed:scheduled-task:old-active', + syncMode: 'managed', + ownerKey: DEFAULT_AUTONOMY_OWNER_KEY, + revision: 1, + trigger: 'scheduled-task', + status: 'queued', + goal: 'old active', + rootDir: tempDir, + currentDir: tempDir, + runCount: 0, + createdAt: 1, + updatedAt: 1, + }, + ...Array.from({ length: 100 }, (_, index) => ({ + flowId: `history-${index}`, + flowKey: `managed:scheduled-task:history-${index}`, + syncMode: 'managed' as const, + ownerKey: DEFAULT_AUTONOMY_OWNER_KEY, + revision: 1, + trigger: 'scheduled-task' as const, + status: 'succeeded' as const, + goal: `history ${index}`, + rootDir: tempDir, + currentDir: tempDir, + runCount: 1, + createdAt: 1_000 + index, + updatedAt: 1_000 + index, + endedAt: 2_000 + index, + })), + ] + const flowsPath = 
resolveAutonomyFlowsPath(tempDir) + await mkdir(join(tempDir, AUTONOMY_DIR), { recursive: true }) + await writeFile( + flowsPath, + `${JSON.stringify({ flows }, null, 2)}\n`, + 'utf-8', + ) + + await startManagedAutonomyFlow({ + trigger: 'scheduled-task', + goal: 'fresh active', + steps: TWO_STEPS, + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'fresh-active', + nowMs: 9_999, + }) + + const persisted = await listAutonomyFlows(tempDir) + expect(persisted).toHaveLength(100) + expect(persisted.some(flow => flow.flowId === 'old-active')).toBe(true) + expect(persisted.some(flow => flow.flowId === 'history-0')).toBe(false) + }) }) describe('startManagedAutonomyFlow', () => { @@ -225,6 +292,49 @@ describe('startManagedAutonomyFlow', () => { expect(result!.nextStep!.step.name).toBe('gather') }) + test('normalizes and preserves boundary across completed flow restarts', async () => { + const first = await startManagedAutonomyFlow({ + trigger: 'scheduled-task', + goal: 'Scoped flow', + steps: [{ name: 'only', prompt: 'Do it' }], + rootDir: tempDir, + sourceId: 'scoped-src', + boundary: [' src/utils/** ', 'src\\bad', '/absolute', 'docs/*.md'], + nowMs: 1000, + }) + const flowId = first!.flow.flowId + + expect(first!.flow.boundary).toEqual(['src/utils/**', 'docs/*.md']) + + await queueManagedAutonomyFlowStepRun({ + flowId, + stepId: first!.nextStep!.step.stepId, + stepIndex: 0, + runId: 'run-1', + rootDir: tempDir, + nowMs: 2000, + }) + await markManagedAutonomyFlowStepCompleted({ + flowId, + runId: 'run-1', + rootDir: tempDir, + nowMs: 3000, + }) + + const restarted = await startManagedAutonomyFlow({ + trigger: 'scheduled-task', + goal: 'Scoped flow', + steps: [{ name: 'only', prompt: 'Do it again' }], + rootDir: tempDir, + sourceId: 'scoped-src', + nowMs: 4000, + }) + + expect(restarted!.started).toBe(true) + expect(restarted!.flow.flowId).toBe(flowId) + expect(restarted!.flow.boundary).toEqual(['src/utils/**', 'docs/*.md']) + }) + test('sets status=waiting when first 
step has waitFor', async () => { const result = await startManagedAutonomyFlow({ trigger: 'scheduled-task', diff --git a/src/utils/__tests__/autonomyPersistence.test.ts b/src/utils/__tests__/autonomyPersistence.test.ts index a265263eeb..f16877206e 100644 --- a/src/utils/__tests__/autonomyPersistence.test.ts +++ b/src/utils/__tests__/autonomyPersistence.test.ts @@ -54,6 +54,25 @@ describe('withAutonomyPersistenceLock', () => { ).rejects.toThrow('inner failure') }) + test('releases same-root lock bookkeeping after success and failure', async () => { + const { + getAutonomyPersistenceLockCountForTests, + withAutonomyPersistenceLock, + } = await import('../autonomyPersistence') + + expect(getAutonomyPersistenceLockCountForTests()).toBe(0) + + await withAutonomyPersistenceLock(tempDir, async () => 'ok') + expect(getAutonomyPersistenceLockCountForTests()).toBe(0) + + await expect( + withAutonomyPersistenceLock(tempDir, async () => { + throw new Error('inner failure') + }), + ).rejects.toThrow('inner failure') + expect(getAutonomyPersistenceLockCountForTests()).toBe(0) + }) + test('serializes concurrent calls on the same rootDir', async () => { const { withAutonomyPersistenceLock } = await import( '../autonomyPersistence' diff --git a/src/utils/__tests__/autonomyQueueLifecycle.test.ts b/src/utils/__tests__/autonomyQueueLifecycle.test.ts new file mode 100644 index 0000000000..2449f84051 --- /dev/null +++ b/src/utils/__tests__/autonomyQueueLifecycle.test.ts @@ -0,0 +1,279 @@ +import { afterEach, beforeEach, describe, expect, test } from 'bun:test' +import { createTempDir, cleanupTempDir } from '../../../tests/mocks/file-system' +import { getAttachmentMessages } from '../attachments' +import { + createAutonomyQueuedPrompt, + createProactiveAutonomyCommands, + getAutonomyRunById, + markAutonomyRunCancelled, + startManagedAutonomyFlowFromHeartbeatTask, +} from '../autonomyRuns' +import { getAutonomyFlowById, listAutonomyFlows } from '../autonomyFlows' +import { + 
cancelQueuedAutonomyCommands, + claimConsumableQueuedAutonomyCommands, + finalizeAutonomyCommandsForTurn, + partitionConsumableQueuedAutonomyCommands, +} from '../autonomyQueueLifecycle' +import { + enqueue, + getCommandsByMaxPriority, + remove as removeFromQueue, + resetCommandQueue, +} from '../messageQueueManager' + +let tempDir = '' +let extraTempDirs: string[] = [] + +beforeEach(async () => { + tempDir = await createTempDir('autonomy-queue-lifecycle-') + extraTempDirs = [] + resetCommandQueue() +}) + +afterEach(async () => { + resetCommandQueue() + if (tempDir) { + await cleanupTempDir(tempDir) + } + for (const extraTempDir of extraTempDirs) { + await cleanupTempDir(extraTempDir) + } +}) + +describe('autonomyQueueLifecycle', () => { + async function consumeQueuedAutonomyAttachmentTurn() { + const previousDisableAttachments = + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS = '1' + try { + const snapshot = getCommandsByMaxPriority('later') + const claim = await claimConsumableQueuedAutonomyCommands( + snapshot, + tempDir, + ) + removeFromQueue(claim.staleCommands) + removeFromQueue(claim.claimedCommands) + + const attachments = [] + for await (const attachment of getAttachmentMessages( + null, + {} as never, + null, + claim.attachmentCommands, + [], + )) { + attachments.push(attachment) + } + + const consumedCommands = claim.attachmentCommands.filter( + command => + (command.mode === 'prompt' || command.mode === 'task-notification') && + !claim.claimedCommands.includes(command), + ) + removeFromQueue(consumedCommands) + const nextCommands = await finalizeAutonomyCommandsForTurn({ + commands: claim.claimedCommands, + outcome: { type: 'completed' }, + currentDir: tempDir, + priority: 'later', + }) + for (const command of nextCommands) { + enqueue(command) + } + + return { attachments, runningRunIds: claim.claimedRunIds, nextCommands } + } finally { + if (previousDisableAttachments === undefined) { + delete 
process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS + } else { + process.env.CLAUDE_CODE_DISABLE_ATTACHMENTS = previousDisableAttachments + } + } + } + + test('filters stale autonomy commands before mid-turn attachment consumption', async () => { + const command = await createAutonomyQueuedPrompt({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + }) + expect(command).not.toBeNull() + + const initial = await partitionConsumableQueuedAutonomyCommands( + [command!], + tempDir, + ) + expect(initial.attachmentCommands).toHaveLength(1) + expect(initial.staleCommands).toHaveLength(0) + + await markAutonomyRunCancelled(command!.autonomy!.runId, tempDir) + + const afterCancel = await partitionConsumableQueuedAutonomyCommands( + [command!], + tempDir, + ) + expect(afterCancel.attachmentCommands).toHaveLength(0) + expect(afterCancel.staleCommands).toHaveLength(1) + }) + + test('cancels proactive commands that are created but dropped before enqueue', async () => { + const commands = await createProactiveAutonomyCommands({ + basePrompt: '12:00:00', + rootDir: tempDir, + currentDir: tempDir, + }) + expect(commands).toHaveLength(1) + + const queuedRun = await getAutonomyRunById( + commands[0]!.autonomy!.runId, + tempDir, + ) + expect(queuedRun!.status).toBe('queued') + + await cancelQueuedAutonomyCommands({ commands, rootDir: tempDir }) + + const cancelledRun = await getAutonomyRunById( + commands[0]!.autonomy!.runId, + tempDir, + ) + expect(cancelledRun!.status).toBe('cancelled') + }) + + test('uses command rootDir when claiming after project context changes', async () => { + const otherProjectDir = await createTempDir('autonomy-other-project-') + extraTempDirs.push(otherProjectDir) + const command = await createAutonomyQueuedPrompt({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + }) + expect(command).not.toBeNull() + expect(command!.autonomy?.rootDir).toBe(tempDir) + 
+ const claim = await claimConsumableQueuedAutonomyCommands( + [command!], + otherProjectDir, + ) + + const originalRun = await getAutonomyRunById( + command!.autonomy!.runId, + tempDir, + ) + const wrongProjectRun = await getAutonomyRunById( + command!.autonomy!.runId, + otherProjectDir, + ) + + expect(claim.claimedRunIds).toEqual([command!.autonomy!.runId]) + expect(claim.attachmentCommands).toHaveLength(1) + expect(originalRun!.status).toBe('running') + expect(wrongProjectRun).toBeNull() + }) + + test('advances a managed flow consumed as a queued attachment', async () => { + const command = await startManagedAutonomyFlowFromHeartbeatTask({ + task: { + name: 'weekly-report', + interval: '7d', + prompt: 'Ship the weekly report', + steps: [ + { name: 'gather', prompt: 'Gather weekly inputs' }, + { name: 'draft', prompt: 'Draft weekly report' }, + ], + }, + rootDir: tempDir, + currentDir: tempDir, + }) + expect(command).not.toBeNull() + + const claim = await claimConsumableQueuedAutonomyCommands( + [command!], + tempDir, + ) + const runningRunIds = claim.claimedRunIds + expect(runningRunIds).toEqual([command!.autonomy!.runId]) + + const nextCommands = await finalizeAutonomyCommandsForTurn({ + commands: claim.claimedCommands, + outcome: { type: 'completed' }, + currentDir: tempDir, + priority: 'later', + }) + const [flow] = await listAutonomyFlows(tempDir) + const detail = await getAutonomyFlowById(flow!.flowId, tempDir) + const run = await getAutonomyRunById(command!.autonomy!.runId, tempDir) + + expect(run!.status).toBe('completed') + expect(nextCommands).toHaveLength(1) + expect(nextCommands[0]!.autonomy?.flowId).toBe(flow!.flowId) + expect(detail!.stateJson!.steps.map(step => step.status)).toEqual([ + 'completed', + 'queued', + ]) + }) + + test('keeps managed autonomy flow coherent across queued attachment turns', async () => { + const firstCommand = await startManagedAutonomyFlowFromHeartbeatTask({ + task: { + name: 'weekly-report', + interval: '7d', + prompt: 
'Ship the weekly report', + steps: [ + { name: 'gather', prompt: 'Gather weekly inputs' }, + { name: 'draft', prompt: 'Draft weekly report' }, + ], + }, + rootDir: tempDir, + currentDir: tempDir, + }) + expect(firstCommand).not.toBeNull() + enqueue(firstCommand!) + + const firstTurn = await consumeQueuedAutonomyAttachmentTurn() + const queuedAfterFirstTurn = getCommandsByMaxPriority('later') + const [flowAfterFirstTurn] = await listAutonomyFlows(tempDir) + const firstRun = await getAutonomyRunById( + firstCommand!.autonomy!.runId, + tempDir, + ) + + expect(firstTurn.attachments).toHaveLength(1) + expect(firstTurn.attachments[0]!.attachment?.type).toBe('queued_command') + expect(firstTurn.runningRunIds).toEqual([firstCommand!.autonomy!.runId]) + expect(firstTurn.nextCommands).toHaveLength(1) + expect(queuedAfterFirstTurn).toHaveLength(1) + expect(queuedAfterFirstTurn[0]!.autonomy?.flowId).toBe( + flowAfterFirstTurn!.flowId, + ) + expect(firstRun!.status).toBe('completed') + expect( + flowAfterFirstTurn!.stateJson!.steps.map(step => step.status), + ).toEqual(['completed', 'queued']) + + const secondCommand = queuedAfterFirstTurn[0]! 
+ const secondTurn = await consumeQueuedAutonomyAttachmentTurn() + const queuedAfterSecondTurn = getCommandsByMaxPriority('later') + const finalFlow = await getAutonomyFlowById( + flowAfterFirstTurn!.flowId, + tempDir, + ) + const secondRun = await getAutonomyRunById( + secondCommand.autonomy!.runId, + tempDir, + ) + + expect(secondTurn.attachments).toHaveLength(1) + expect(secondTurn.runningRunIds).toEqual([secondCommand.autonomy!.runId]) + expect(secondTurn.nextCommands).toHaveLength(0) + expect(queuedAfterSecondTurn).toHaveLength(0) + expect(secondRun!.status).toBe('completed') + expect(finalFlow!.status).toBe('succeeded') + expect(finalFlow!.stateJson!.steps.map(step => step.status)).toEqual([ + 'completed', + 'completed', + ]) + }) +}) diff --git a/src/utils/__tests__/autonomyRuns.test.ts b/src/utils/__tests__/autonomyRuns.test.ts index 056083e486..268b856fd0 100644 --- a/src/utils/__tests__/autonomyRuns.test.ts +++ b/src/utils/__tests__/autonomyRuns.test.ts @@ -1,6 +1,5 @@ import { afterEach, beforeEach, describe, expect, test } from 'bun:test' -import { mkdir, writeFile } from 'fs/promises' -import { join } from 'path' +import { join, resolve as resolvePath } from 'node:path' import { resetStateForTests, setCwdState, @@ -8,17 +7,23 @@ import { setProjectRoot, } from '../../bootstrap/state' import { + createAutonomyRun, formatAutonomyRunsList, formatAutonomyRunsStatus, listAutonomyRuns, createAutonomyQueuedPrompt, + createAutonomyQueuedPromptIfNoActiveSource, createProactiveAutonomyCommands, finalizeAutonomyRunCompleted, + getAutonomyRunById, + hasActiveAutonomyRunForSource, markAutonomyRunCompleted, + markAutonomyRunCancelled, markAutonomyRunFailed, markAutonomyRunRunning, recoverManagedAutonomyFlowPrompt, resolveAutonomyRunsPath, + STALE_ACTIVE_RUN_ERROR_PREFIX, startManagedAutonomyFlowFromHeartbeatTask, } from '../autonomyRuns' import { @@ -35,11 +40,14 @@ import { cleanupTempDir, createTempDir, createTempSubdir, + readTempFile, + tempPathExists, 
writeTempFile, } from '../../../tests/mocks/file-system' const AGENTS_REL = join(AUTONOMY_DIR, 'AGENTS.md') const HEARTBEAT_REL = join(AUTONOMY_DIR, 'HEARTBEAT.md') +const RUNS_REL = join(AUTONOMY_DIR, 'runs.json') let tempDir = '' @@ -95,7 +103,9 @@ describe('autonomyRuns', () => { ownerKey: 'main-thread', sourceId: 'cron-1', sourceLabel: 'nightly-report', + ownerProcessId: process.pid, }) + expect(runs[0]?.ownerSessionId).toBeString() expect(flows).toHaveLength(0) expect(resolveAutonomyRunsPath(tempDir)).toContain('.claude') }) @@ -118,7 +128,7 @@ describe('autonomyRuns', () => { expect(command!.value).toContain('nested authority') }) - test('markAutonomyRunRunning/completed/failed update persisted lifecycle state for plain runs', async () => { + test('markAutonomyRunRunning/completed update persisted lifecycle state for plain runs', async () => { const command = await createAutonomyQueuedPrompt({ basePrompt: '12:00:00', trigger: 'proactive-tick', @@ -134,7 +144,9 @@ describe('autonomyRuns', () => { runId, status: 'running', startedAt: 100, + ownerProcessId: process.pid, }) + expect(runs[0]?.ownerSessionId).toBeString() await markAutonomyRunCompleted(runId, tempDir, 200) runs = await listAutonomyRuns(tempDir) @@ -143,9 +155,22 @@ describe('autonomyRuns', () => { status: 'completed', endedAt: 200, }) + }) + test('markAutonomyRunFailed updates a non-terminal run', async () => { + const command = await createAutonomyQueuedPrompt({ + basePrompt: '12:00:00', + trigger: 'proactive-tick', + rootDir: tempDir, + currentDir: tempDir, + }) + expect(command).not.toBeNull() + const runId = command!.autonomy!.runId + + await markAutonomyRunRunning(runId, tempDir, 100) await markAutonomyRunFailed(runId, 'boom', tempDir, 300) - runs = await listAutonomyRuns(tempDir) + const runs = await listAutonomyRuns(tempDir) + expect(runs[0]).toMatchObject({ runId, status: 'failed', @@ -154,6 +179,346 @@ describe('autonomyRuns', () => { }) }) + test('terminal runs are not revived by stale 
lifecycle updates', async () => { + const command = await createAutonomyQueuedPrompt({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + }) + expect(command).not.toBeNull() + const runId = command!.autonomy!.runId + + await markAutonomyRunCancelled(runId, tempDir, 100) + const revived = await markAutonomyRunRunning(runId, tempDir, 200) + const completed = await markAutonomyRunCompleted(runId, tempDir, 300) + const failed = await markAutonomyRunFailed( + runId, + 'late failure', + tempDir, + 400, + ) + const persisted = await getAutonomyRunById(runId, tempDir) + + expect(revived).toBeNull() + expect(completed).toBeNull() + expect(failed).toBeNull() + expect(persisted).toMatchObject({ + status: 'cancelled', + endedAt: 100, + }) + expect(persisted!.error).toBeUndefined() + }) + + test('hasActiveAutonomyRunForSource only treats queued and running scheduled runs as active', async () => { + const command = await createAutonomyQueuedPrompt({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + sourceLabel: 'nightly', + }) + expect(command).not.toBeNull() + const runId = command!.autonomy!.runId + + await expect( + hasActiveAutonomyRunForSource({ + trigger: 'scheduled-task', + sourceId: 'cron-1', + rootDir: tempDir, + }), + ).resolves.toBe(true) + + await markAutonomyRunRunning(runId, tempDir, 100) + await expect( + hasActiveAutonomyRunForSource({ + trigger: 'scheduled-task', + sourceId: 'cron-1', + rootDir: tempDir, + }), + ).resolves.toBe(true) + + await expect( + hasActiveAutonomyRunForSource({ + trigger: 'scheduled-task', + sourceId: 'cron-2', + rootDir: tempDir, + }), + ).resolves.toBe(false) + + await markAutonomyRunCompleted(runId, tempDir, 200) + await expect( + hasActiveAutonomyRunForSource({ + trigger: 'scheduled-task', + sourceId: 'cron-1', + rootDir: tempDir, + }), + ).resolves.toBe(false) + + const failedCommand = await 
createAutonomyQueuedPrompt({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + }) + expect(failedCommand).not.toBeNull() + await markAutonomyRunFailed( + failedCommand!.autonomy!.runId, + 'boom', + tempDir, + 300, + ) + await expect( + hasActiveAutonomyRunForSource({ + trigger: 'scheduled-task', + sourceId: 'cron-1', + rootDir: tempDir, + }), + ).resolves.toBe(false) + }) + + test('createAutonomyQueuedPromptIfNoActiveSource atomically skips duplicate active scheduled sources', async () => { + const [first, second] = await Promise.all([ + createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + }), + createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + }), + ]) + + const created = [first, second].filter(command => command !== null) + const runs = await listAutonomyRuns(tempDir) + + expect(created).toHaveLength(1) + expect(runs).toHaveLength(1) + expect(runs[0]).toMatchObject({ + trigger: 'scheduled-task', + status: 'queued', + sourceId: 'cron-1', + }) + }) + + test('createAutonomyQueuedPromptIfNoActiveSource scopes dedup by ownerKey', async () => { + const first = await createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + ownerKey: 'owner-a', + }) + const second = await createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + ownerKey: 'owner-b', + }) + + const runs = await listAutonomyRuns(tempDir) + + expect(first).not.toBeNull() + expect(second).not.toBeNull() + expect(runs).toHaveLength(2) + expect(new 
Set(runs.map(run => run.ownerKey))).toEqual( + new Set(['owner-a', 'owner-b']), + ) + }) + + test('createAutonomyQueuedPromptIfNoActiveSource does not advance heartbeat last-run state on dedup skip (two-phase commit invariant)', async () => { + await writeTempFile( + tempDir, + HEARTBEAT_REL, + [ + 'tasks:', + ' - name: inbox', + ' interval: 30m', + ' prompt: "Check inbox"', + ].join('\n'), + ) + + // Seed an active queued run for cron-1 so the next dedup attempt skips. + await writeTempFile( + tempDir, + RUNS_REL, + `${JSON.stringify( + { + runs: [ + { + runId: 'preexisting-active', + runtime: 'automatic', + trigger: 'scheduled-task', + status: 'queued', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + promptPreview: 'still queued', + createdAt: 100, + ownerProcessId: process.pid, + ownerSessionId: 'self', + }, + ], + }, + null, + 2, + )}\n`, + ) + + const skipped = await createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + }) + expect(skipped).toBeNull() + + // If the dedup skip wrongly advanced heartbeat state, the next + // proactive-tick prompt would NOT include the inbox task. Verify it + // still does. 
+ const followUp = await createAutonomyQueuedPrompt({ + basePrompt: '12:00:00', + trigger: 'proactive-tick', + rootDir: tempDir, + currentDir: tempDir, + }) + expect(followUp).not.toBeNull() + expect(followUp!.value).toContain('Due HEARTBEAT.md tasks:') + expect(followUp!.value).toContain('- inbox (30m): Check inbox') + }) + + test('createAutonomyQueuedPromptIfNoActiveSource recovers stale active runs from dead owner processes', async () => { + await writeTempFile( + tempDir, + RUNS_REL, + `${JSON.stringify( + { + runs: [ + { + runId: 'stale-run', + runtime: 'automatic', + trigger: 'scheduled-task', + status: 'running', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + sourceLabel: 'nightly', + promptPreview: 'stale scheduled prompt', + createdAt: 100, + startedAt: 100, + ownerProcessId: 2_147_483_647, + ownerSessionId: 'dead-session', + }, + ], + }, + null, + 2, + )}\n`, + ) + + await expect( + hasActiveAutonomyRunForSource({ + trigger: 'scheduled-task', + sourceId: 'cron-1', + rootDir: tempDir, + }), + ).resolves.toBe(false) + + const command = await createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: 'scheduled prompt', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + }) + const runs = await listAutonomyRuns(tempDir) + + expect(command).not.toBeNull() + expect(runs).toHaveLength(2) + expect(runs[0]).toMatchObject({ + trigger: 'scheduled-task', + status: 'queued', + sourceId: 'cron-1', + ownerProcessId: process.pid, + }) + expect(runs[1]).toMatchObject({ + runId: 'stale-run', + status: 'failed', + endedAt: runs[0]?.createdAt, + error: expect.stringContaining('owner process 2147483647'), + }) + }) + + test('stale managed-flow run recovery also marks the flow step failed', async () => { + const command = await startManagedAutonomyFlowFromHeartbeatTask({ + task: { + name: 'weekly-report', + interval: '7d', + prompt: 'Ship the weekly report', + steps: [ + { + name: 'gather', + prompt: 'Gather 
weekly inputs', + }, + ], + }, + rootDir: tempDir, + currentDir: tempDir, + }) + expect(command).not.toBeNull() + const runId = command!.autonomy!.runId + await markAutonomyRunRunning(runId, tempDir, 100) + + const runsPath = resolveAutonomyRunsPath(tempDir) + const file = JSON.parse(await readTempFile(runsPath)) as { + runs: Array> + } + file.runs = file.runs.map(run => + run.runId === runId + ? { ...run, ownerProcessId: 2_147_483_647 } + : run, + ) + await writeTempFile(tempDir, RUNS_REL, `${JSON.stringify(file, null, 2)}\n`) + + const replacement = await createAutonomyQueuedPromptIfNoActiveSource({ + basePrompt: 'replacement prompt', + trigger: 'managed-flow-step', + rootDir: tempDir, + currentDir: tempDir, + sourceId: command!.autonomy!.sourceId!, + ownerKey: 'main-thread', + }) + const [flow] = await listAutonomyFlows(tempDir) + const runs = await listAutonomyRuns(tempDir) + + expect(replacement).not.toBeNull() + expect(runs.find(run => run.runId === runId)).toMatchObject({ + status: 'failed', + error: expect.stringContaining(STALE_ACTIVE_RUN_ERROR_PREFIX), + }) + expect(flow).toMatchObject({ + status: 'failed', + blockedRunId: runId, + }) + expect(flow?.stateJson?.steps[0]).toMatchObject({ + status: 'failed', + runId, + error: expect.stringContaining(STALE_ACTIVE_RUN_ERROR_PREFIX), + }) + }) + test('formatters produce readable status and run listings', async () => { const first = await createAutonomyQueuedPrompt({ basePrompt: 'scheduled prompt', @@ -223,11 +588,56 @@ describe('autonomyRuns', () => { ) }) + test('persistence pruning keeps active runs ahead of recent completed history', async () => { + const runs = [ + { + runId: 'old-active', + runtime: 'automatic', + trigger: 'scheduled-task', + status: 'queued', + rootDir: tempDir, + currentDir: tempDir, + ownerKey: 'main-thread', + promptPreview: 'old active', + createdAt: 1, + }, + ...Array.from({ length: 200 }, (_, index) => ({ + runId: `history-${index}`, + runtime: 'automatic', + trigger: 
'scheduled-task', + status: 'completed', + rootDir: tempDir, + currentDir: tempDir, + ownerKey: 'main-thread', + promptPreview: `history ${index}`, + createdAt: 1_000 + index, + endedAt: 2_000 + index, + })), + ] + await writeTempFile( + tempDir, + RUNS_REL, + `${JSON.stringify({ runs }, null, 2)}\n`, + ) + + await createAutonomyRun({ + trigger: 'scheduled-task', + prompt: 'fresh active', + rootDir: tempDir, + currentDir: tempDir, + nowMs: 9_999, + }) + + const persisted = await listAutonomyRuns(tempDir) + expect(persisted).toHaveLength(200) + expect(persisted.some(run => run.runId === 'old-active')).toBe(true) + expect(persisted.some(run => run.runId === 'history-0')).toBe(false) + }) + test('listAutonomyRuns keeps older persisted records by normalizing missing runtime and owner metadata', async () => { - const runsPath = resolveAutonomyRunsPath(tempDir) - await mkdir(join(tempDir, '.claude', 'autonomy'), { recursive: true }) - await writeFile( - runsPath, + await writeTempFile( + tempDir, + RUNS_REL, `${JSON.stringify( { runs: [ @@ -244,7 +654,6 @@ describe('autonomyRuns', () => { null, 2, )}\n`, - 'utf-8', ) const [legacy] = await listAutonomyRuns(tempDir) @@ -418,4 +827,27 @@ describe('autonomyRuns', () => { expect(recovered!.autonomy?.runId).toBe(command!.autonomy?.runId) expect(recovered!.autonomy?.flowId).toBe(flow!.flowId) }) + + test('STALE_ACTIVE_RUN_ERROR_PREFIX stays in sync with HEARTBEAT.md stale-recovery-health task', async () => { + // The HEARTBEAT.md stale-recovery-health task prompt embeds this prefix + // as a literal string. Changing the constant without updating the + // heartbeat prompt would silently break the monitor — this test fails + // first to force the simultaneous update. 
/**
 * Blanks out fenced code blocks so their contents cannot be mistaken for
 * configuration.
 *
 * Every line that belongs to a fenced block — the opening delimiter, the
 * body, and the closing delimiter — is replaced with an empty string. All
 * other lines are returned unchanged and keep their original index, so
 * callers can still report positions against the source document. Used by
 * the heartbeat parser so that `tasks:` literals appearing inside Markdown
 * code samples in HEARTBEAT.md do not collide with the real config block.
 *
 * Fence rules applied here:
 * - a fence is a run of three or more backticks or tildes at the start of
 *   the trimmed line;
 * - a closing fence uses the same character, is at least as long as the
 *   opener, and has nothing but whitespace after it;
 * - a backtick run followed by text that itself contains a backtick is an
 *   inline code span, not an opener, and is left untouched. Treating it as
 *   an opener would mask the remainder of the document when no closer
 *   follows.
 *
 * @param lines Document split into lines. The array is not mutated.
 * @returns A new array of the same length with fenced lines emptied.
 */
function maskCodeFencedLines(lines: string[]): string[] {
  const fencePattern = /^(`{3,}|~{3,})/
  const masked = lines.slice()
  let openFenceChar: '`' | '~' | null = null
  let openFenceLen = 0

  for (let i = 0; i < masked.length; i++) {
    const trimmed = (masked[i] ?? '').trim()
    const fence = fencePattern.exec(trimmed)

    if (fence === null) {
      // Ordinary line: only blank it while a fence is open.
      if (openFenceChar !== null) {
        masked[i] = ''
      }
      continue
    }

    const run = fence[0]
    const fenceChar = run.charAt(0) === '`' ? '`' : '~'
    const rest = trimmed.slice(run.length)

    if (openFenceChar === null) {
      if (fenceChar === '`' && rest.includes('`')) {
        // Inline code span at the start of a line, not a fence opener.
        continue
      }
      openFenceChar = fenceChar
      openFenceLen = run.length
    } else if (
      fenceChar === openFenceChar &&
      run.length >= openFenceLen &&
      rest.trim() === ''
    ) {
      openFenceChar = null
      openFenceLen = 0
    }
    // Opening delimiter, closing delimiter, or a fence-looking line nested
    // inside an open block: all are part of the block and get blanked.
    masked[i] = ''
  }

  return masked
}
/**
 * Returns true when `value` is acceptable as a boundary pattern: non-empty,
 * not starting with `/`, free of backslashes and NUL characters, and with no
 * `..` path segment.
 */
function isPosixBoundaryGlob(value: string): boolean {
  const malformed =
    value === '' ||
    value.startsWith('/') ||
    value.includes('\\') ||
    value.includes('\0')
  return !malformed && value.split('/').every(segment => segment !== '..')
}

/**
 * Cleans an untrusted `boundary` value.
 *
 * Non-array input yields `undefined`. Otherwise non-string entries are
 * dropped, strings are trimmed, entries rejected by `isPosixBoundaryGlob`
 * are discarded, and duplicates are removed keeping the first occurrence.
 *
 * @returns The surviving patterns in input order, or `undefined` when none
 *   survive.
 */
function normalizeBoundary(value: unknown): string[] | undefined {
  if (!Array.isArray(value)) {
    return undefined
  }
  // A Set iterates in insertion order, which gives first-occurrence dedup.
  const unique = new Set<string>()
  for (const entry of value) {
    if (typeof entry !== 'string') {
      continue
    }
    const pattern = entry.trim()
    if (isPosixBoundaryGlob(pattern)) {
      unique.add(pattern)
    }
  }
  return unique.size === 0 ? undefined : [...unique]
}
/**
 * Two-phase persistence retention.
 *
 * Active records (as decided by `isActive`) are always kept, even when there
 * are more of them than `max` — capping them risks evicting in-flight work.
 * Inactive records are ranked newest-first by `getTimestamp` and only fill
 * whatever budget remains below `max`.
 *
 * The returned list is sorted newest-first regardless of activity, so it can
 * be persisted or listed without re-sorting. The input is not mutated.
 *
 * @param records Candidate records.
 * @param isActive Predicate marking records that must never be evicted.
 * @param getTimestamp Ordering key; larger means newer.
 * @param max Target size. It bounds inactive records only; the result can
 *   exceed it when active records alone outnumber it.
 * @returns A new array of retained records, newest first.
 */
export function retainActiveFirst<T>(
  records: readonly T[],
  isActive: (record: T) => boolean,
  getTimestamp: (record: T) => number,
  max: number,
): T[] {
  // A missing or non-finite timestamp would make a subtracting comparator
  // return NaN, which leaves the sort order unspecified. Rank such records
  // as oldest so they are evicted first.
  const rank = (record: T): number => {
    const timestamp = getTimestamp(record)
    return Number.isFinite(timestamp) ? timestamp : Number.NEGATIVE_INFINITY
  }
  const newestFirst = (left: T, right: T): number => {
    const leftRank = rank(left)
    const rightRank = rank(right)
    if (leftRank === rightRank) {
      return 0
    }
    return leftRank < rightRank ? 1 : -1
  }

  const active: T[] = []
  const history: T[] = []
  for (const record of records) {
    if (isActive(record)) {
      active.push(record)
    } else {
      history.push(record)
    }
  }

  history.sort(newestFirst)
  const historyBudget = Math.max(0, max - active.length)
  return [...active, ...history.slice(0, historyBudget)].sort(newestFirst)
}
function withAutonomyPersistenceLock( } } finally { release() - if (persistenceLocks.get(key) === current) { + if (persistenceLocks.get(key) === chained) { persistenceLocks.delete(key) } } diff --git a/src/utils/autonomyQueueLifecycle.ts b/src/utils/autonomyQueueLifecycle.ts new file mode 100644 index 0000000000..efc796b6a0 --- /dev/null +++ b/src/utils/autonomyQueueLifecycle.ts @@ -0,0 +1,261 @@ +import type { QueuedCommand } from '../types/textInputTypes.js' +import { + finalizeAutonomyRunCompleted, + finalizeAutonomyRunFailed, + listAutonomyRuns, + markAutonomyRunCancelled, + markAutonomyRunRunning, +} from './autonomyRuns.js' + +export type AutonomyQueuePartition = { + attachmentCommands: QueuedCommand[] + staleCommands: QueuedCommand[] +} + +export type AutonomyQueueClaim = AutonomyQueuePartition & { + claimedRunIds: string[] + claimedCommands: QueuedCommand[] +} + +export type AutonomyTurnOutcome = + | { type: 'completed' } + | { type: 'cancelled' } + | { type: 'failed'; error?: unknown; message?: string } + +type AutonomyRunRef = { + runId: string + rootDir?: string +} + +function getCommandRootDir( + command: QueuedCommand, + fallbackRootDir?: string, +): string | undefined { + return command.autonomy?.rootDir ?? fallbackRootDir +} + +function refKey(ref: AutonomyRunRef): string { + return `${ref.rootDir ?? 
''}\0${ref.runId}` +} + +function getAutonomyRunRefs( + commands: QueuedCommand[], + fallbackRootDir?: string, +): AutonomyRunRef[] { + const refs = new Map() + for (const command of commands) { + const runId = command.autonomy?.runId + if (!runId) { + continue + } + const ref = { + runId, + rootDir: getCommandRootDir(command, fallbackRootDir), + } + refs.set(refKey(ref), ref) + } + return [...refs.values()] +} + +function isInlineQueuedCommand(command: QueuedCommand): boolean { + return command.mode === 'prompt' || command.mode === 'task-notification' +} + +function groupRefsByRootDir( + refs: AutonomyRunRef[], +): Map { + const grouped = new Map() + for (const ref of refs) { + const key = ref.rootDir ?? '' + const group = grouped.get(key) + if (group) { + group.push(ref) + } else { + grouped.set(key, [ref]) + } + } + return grouped +} + +/** + * Exclude queued autonomy commands whose persisted run is no longer queued. + * This prevents stale in-memory commands from reviving flows after cancellation + * or after another path has already consumed the run. 
+ */ +export async function partitionConsumableQueuedAutonomyCommands( + commands: QueuedCommand[], + rootDir?: string, +): Promise { + const attachmentCommands: QueuedCommand[] = [] + const staleCommands: QueuedCommand[] = [] + const refs = getAutonomyRunRefs(commands, rootDir) + const runsByRef = new Map< + string, + Awaited>[number] + >() + for (const [rootKey, group] of groupRefsByRootDir(refs)) { + const runs = await listAutonomyRuns(rootKey || undefined) + const wanted = new Set(group.map(ref => ref.runId)) + for (const run of runs) { + if (wanted.has(run.runId)) { + runsByRef.set( + refKey({ runId: run.runId, rootDir: rootKey || undefined }), + run, + ) + } + } + } + + for (const command of commands) { + const runId = command.autonomy?.runId + if (!runId) { + attachmentCommands.push(command) + continue + } + + const commandRootDir = getCommandRootDir(command, rootDir) + const run = runsByRef.get(refKey({ runId, rootDir: commandRootDir })) + if (run?.status === 'queued' && !run.startedAt && !run.endedAt) { + attachmentCommands.push(command) + } else { + staleCommands.push(command) + } + } + + return { attachmentCommands, staleCommands } +} + +export async function claimConsumableQueuedAutonomyCommands( + commands: QueuedCommand[], + rootDir?: string, +): Promise { + const partition = await partitionConsumableQueuedAutonomyCommands( + commands, + rootDir, + ) + const claimedRunIds: string[] = [] + const claimedRunKeys: string[] = [] + const staleRunKeys = new Set() + const candidateRefs = getAutonomyRunRefs( + partition.attachmentCommands.filter(isInlineQueuedCommand), + rootDir, + ) + + for (const ref of candidateRefs) { + const updated = await markAutonomyRunRunning(ref.runId, ref.rootDir) + if (updated?.status === 'running') { + claimedRunIds.push(ref.runId) + claimedRunKeys.push(refKey(ref)) + } else { + staleRunKeys.add(refKey(ref)) + } + } + + const claimedRunKeySet = new Set(claimedRunKeys) + const attachmentCommands: QueuedCommand[] = [] + const 
/**
 * Substrings that mark a failure as coming from the model provider. Matched
 * case-insensitively against the stringified error.
 */
const PROVIDER_FAILURE_MARKERS: readonly string[] = [
  'api_error',
  'provider',
  'openai',
  'gemini',
  'grok',
  'anthropic',
  'bedrock',
  'vertex',
]

/**
 * Best-effort conversion of a thrown value to text for classification.
 *
 * Strings are returned as-is and `Error` instances yield their message.
 * Other objects are serialised with `JSON.stringify` so their fields stay
 * visible; `String()` would collapse them to "[object Object]". Values that
 * cannot be serialised (circular references, BigInt members) fall back to
 * `String()`.
 */
function stringifyAutonomyError(error: unknown): string {
  if (typeof error === 'string') {
    return error
  }
  if (error instanceof Error) {
    return error.message
  }
  if (typeof error === 'object' && error !== null) {
    try {
      const serialised: unknown = JSON.stringify(error)
      // JSON.stringify can return undefined, e.g. when toJSON() does.
      if (typeof serialised === 'string') {
        return serialised
      }
    } catch {
      // Not serialisable; use the generic conversion below.
    }
  }
  return String(error)
}

/**
 * Reduces an arbitrary failure to one of two fixed strings.
 *
 * The raw error text is only inspected and never returned, so it cannot
 * reach the caller through this function.
 *
 * @param error The thrown value: string, `Error`, plain object, or anything.
 * @param fallback Text returned for failures not recognised as
 *   provider-side. Defaults to `'query failed'`.
 * @returns `'provider api_error'` when the stringified error contains one of
 *   the provider markers, otherwise `fallback`.
 */
export function sanitizeAutonomyFailureForPersistence(
  error: unknown,
  fallback = 'query failed',
): string {
  const haystack = stringifyAutonomyError(error).toLowerCase()
  const isProviderFailure = PROVIDER_FAILURE_MARKERS.some(marker =>
    haystack.includes(marker),
  )
  return isProviderFailure ? 'provider api_error' : fallback
}
(params.outcome.type === 'completed') { + nextCommands.push( + ...(await finalizeAutonomyRunCompleted({ + runId: autonomy.runId, + rootDir: autonomy.rootDir, + currentDir: params.currentDir, + priority: params.priority, + workload: command.workload ?? params.workload, + })), + ) + } else if (params.outcome.type === 'cancelled') { + await markAutonomyRunCancelled(autonomy.runId, autonomy.rootDir) + } else { + await finalizeAutonomyRunFailed({ + runId: autonomy.runId, + rootDir: autonomy.rootDir, + error: + params.outcome.message ?? + sanitizeAutonomyFailureForPersistence(params.outcome.error), + }) + } + } + return nextCommands +} diff --git a/src/utils/autonomyRuns.ts b/src/utils/autonomyRuns.ts index 02ce08c201..d850be9289 100644 --- a/src/utils/autonomyRuns.ts +++ b/src/utils/autonomyRuns.ts @@ -1,7 +1,7 @@ import { randomUUID } from 'crypto' import { mkdir, writeFile } from 'fs/promises' import { dirname, join, resolve } from 'path' -import { getProjectRoot } from '../bootstrap/state.js' +import { getProjectRoot, getSessionId } from '../bootstrap/state.js' import type { MessageOrigin } from '../types/message.js' import type { QueuedCommand } from '../types/textInputTypes.js' import { @@ -27,11 +27,34 @@ import { type AutonomyFlowSyncMode, type ManagedAutonomyFlowStepDefinition, } from './autonomyFlows.js' -import { withAutonomyPersistenceLock } from './autonomyPersistence.js' +import { + retainActiveFirst, + withAutonomyPersistenceLock, +} from './autonomyPersistence.js' import { getFsImplementation } from './fsOperations.js' +import { isProcessRunning } from './genericProcessUtils.js' +import { logError } from './log.js' const AUTONOMY_RUNS_MAX = 200 +// Diagnostic threshold for active (queued/running) runs. Active records are +// deliberately exempt from AUTONOMY_RUNS_MAX so a leak in finalization cannot +// silently evict in-flight work; that exemption only makes sense if a leak is +// loud when it appears. 
Crossing this threshold warns once per process so +// operators see the divergence in logs before runs.json grows pathologically. +const AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD = 100 +let warnedActiveRunsThresholdCrossed = false const AUTONOMY_RUNS_RELATIVE_PATH = join(AUTONOMY_DIR, 'runs.json') +// Sentinel string surfaced to operators via runs.json error fields and +// referenced literally by the HEARTBEAT.md `stale-recovery-health` task. +// A unit test asserts the HEARTBEAT.md file contains this exact prefix — +// changing the value will fail the test, forcing the heartbeat prompt +// to be updated in the same change. +export const STALE_ACTIVE_RUN_ERROR_PREFIX = + 'Recovered stale active autonomy run' + +// Guards the legacy-block warning so it fires once per (process, runId) instead +// of every dedup tick while a no-owner record sits there. +const warnedLegacyBlockRunIds = new Set() export type AutonomyRunStatus = | 'queued' @@ -59,6 +82,8 @@ export type AutonomyRunRecord = { flowStepName?: string promptPreview: string createdAt: number + ownerProcessId?: number + ownerSessionId?: string startedAt?: number endedAt?: number error?: string @@ -77,6 +102,19 @@ type AutonomyRunFlowRef = { stepName: string } +type CreateAutonomyRunParams = { + trigger: AutonomyTriggerKind + prompt: string + rootDir?: string + currentDir?: string + sourceId?: string + sourceLabel?: string + runtime?: AutonomyRunRuntime + ownerKey?: string + flow?: AutonomyRunFlowRef + nowMs?: number +} + function truncatePromptPreview(prompt: string): string { const singleLine = prompt.replace(/\s+/g, ' ').trim() return singleLine.length <= 240 @@ -95,6 +133,34 @@ function cloneRunRecord(run: AutonomyRunRecord): AutonomyRunRecord { return { ...run } } +function isAutonomyRunActive(run: AutonomyRunRecord): boolean { + return run.status === 'queued' || run.status === 'running' +} + +function selectPersistedAutonomyRuns( + runs: AutonomyRunRecord[], +): AutonomyRunRecord[] { + const cloned = 
runs.map(cloneRunRecord) + const activeCount = cloned.filter(isAutonomyRunActive).length + if ( + !warnedActiveRunsThresholdCrossed && + activeCount >= AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD + ) { + warnedActiveRunsThresholdCrossed = true + logError( + new Error( + `autonomy: ${activeCount} active runs exceed warn threshold ${AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD}; check for finalize leaks`, + ), + ) + } + return retainActiveFirst( + cloned, + isAutonomyRunActive, + run => run.createdAt, + AUTONOMY_RUNS_MAX, + ) +} + function normalizePersistedRunRecord( run: PersistedAutonomyRunRecord, ): AutonomyRunRecord { @@ -157,11 +223,7 @@ async function writeAutonomyRuns( path, `${JSON.stringify( { - runs: runs - .slice() - .map(cloneRunRecord) - .sort((left, right) => right.createdAt - left.createdAt) - .slice(0, AUTONOMY_RUNS_MAX), + runs: selectPersistedAutonomyRuns(runs), } satisfies AutonomyRunsFile, null, 2, @@ -172,7 +234,7 @@ async function writeAutonomyRuns( async function updateAutonomyRun( runId: string, - updater: (current: AutonomyRunRecord) => AutonomyRunRecord, + updater: (current: AutonomyRunRecord) => AutonomyRunRecord | null, rootDir: string = getProjectRoot(), ): Promise { return withAutonomyPersistenceLock(rootDir, async () => { @@ -181,7 +243,11 @@ async function updateAutonomyRun( if (index === -1) { return null } - const updated = cloneRunRecord(updater(cloneRunRecord(runs[index]!))) + const next = updater(cloneRunRecord(runs[index]!)) + if (!next) { + return null + } + const updated = cloneRunRecord(next) runs[index] = updated await writeAutonomyRuns(runs, rootDir) return updated @@ -196,21 +262,112 @@ export async function getAutonomyRunById( return runs.find(run => run.runId === runId) ?? 
/**
 * Type guard for a usable owner PID: an integer in the range 1..4_194_304.
 *
 * Anything else — `undefined`, NaN, fractions, zero, negatives, or values
 * above the ceiling — is rejected. A forged record with pid=0 or pid<0 must
 * not be treated as live, since it could then block dedup permanently.
 *
 * NOTE(review): the ceiling presumably mirrors the Linux PID limit (2^22);
 * confirm it is also appropriate on the other platforms this runs on.
 */
function isValidOwnerProcessId(pid: number | undefined): pid is number {
  if (typeof pid !== 'number' || !Number.isInteger(pid)) {
    return false
  }
  return pid >= 1 && pid <= 4_194_304
}
'Autonomy run failed.', + rootDir, + nowMs: run.endedAt, + }) + } +} + +function matchesActiveAutonomyRunSource( + run: AutonomyRunRecord, + params: { + trigger: AutonomyTriggerKind + sourceId: string + ownerKey?: string + }, +): boolean { + return ( + run.trigger === params.trigger && + run.sourceId === params.sourceId && + (params.ownerKey === undefined || run.ownerKey === params.ownerKey) && + isActiveAutonomyRunStatus(run.status) + ) +} + +export async function hasActiveAutonomyRunForSource(params: { trigger: AutonomyTriggerKind - prompt: string + sourceId: string rootDir?: string - currentDir?: string - sourceId?: string - sourceLabel?: string - runtime?: AutonomyRunRuntime ownerKey?: string - flow?: AutonomyRunFlowRef - nowMs?: number -}): Promise { - const rootDir = resolve(params.rootDir ?? getProjectRoot()) - const currentDir = resolve(params.currentDir ?? rootDir) - const record: AutonomyRunRecord = { +}): Promise { + const runs = await listAutonomyRuns(params.rootDir) + return runs.some( + run => + matchesActiveAutonomyRunSource(run, params) && + !isStaleActiveAutonomyRun(run), + ) +} + +function buildAutonomyRunRecord( + params: CreateAutonomyRunParams, + rootDir: string, + currentDir: string, +): AutonomyRunRecord { + const createdAt = params.nowMs ?? Date.now() + return { runId: randomUUID(), runtime: params.runtime ?? (params.flow ? 'flow_step' : 'automatic'), trigger: params.trigger, @@ -231,13 +388,77 @@ export async function createAutonomyRun(params: { } : {}), promptPreview: truncatePromptPreview(params.prompt), - createdAt: params.nowMs ?? 
Date.now(), + createdAt, + ownerProcessId: process.pid, + ownerSessionId: getSessionId(), } +} + +async function persistAutonomyRunRecord( + record: AutonomyRunRecord, + rootDir: string, + skipWhenActiveSource: boolean, +): Promise<{ + created: boolean + recoveredStaleRuns: AutonomyRunRecord[] +}> { + let created = false + const recoveredStaleRuns: AutonomyRunRecord[] = [] await withAutonomyPersistenceLock(rootDir, async () => { const runs = await listAutonomyRuns(rootDir) + const sourceId = record.sourceId + if (skipWhenActiveSource && sourceId) { + let hasBlockingActiveRun = false + let staleRecoveriesApplied = false + for (let i = 0; i < runs.length; i++) { + const run = runs[i]! + if ( + !matchesActiveAutonomyRunSource(run, { + trigger: record.trigger, + sourceId, + ownerKey: record.ownerKey, + }) + ) { + continue + } + if (isStaleActiveAutonomyRun(run)) { + const recovered = recoverStaleActiveAutonomyRun(run, record.createdAt) + runs[i] = recovered + recoveredStaleRuns.push(recovered) + staleRecoveriesApplied = true + continue + } + if ( + run.ownerProcessId === undefined && + !warnedLegacyBlockRunIds.has(run.runId) + ) { + warnedLegacyBlockRunIds.add(run.runId) + logError( + new Error( + `[autonomyRuns] blocked by legacy un-owned active run ${run.runId} (createdAt=${run.createdAt}); cancel manually if this is a stale upgrade artifact`, + ), + ) + } + hasBlockingActiveRun = true + } + if (hasBlockingActiveRun) { + if (staleRecoveriesApplied) { + await writeAutonomyRuns(runs, rootDir) + } + return + } + } runs.unshift(record) await writeAutonomyRuns(runs, rootDir) + created = true }) + return { created, recoveredStaleRuns } +} + +async function queueManagedFlowStepRunForRecord( + record: AutonomyRunRecord, + rootDir: string, +): Promise { if ( record.parentFlowId && record.flowStepId && @@ -258,9 +479,47 @@ export async function createAutonomyRun(params: { nowMs: record.createdAt, }) } +} + +async function createAutonomyRunCore( + params: 
CreateAutonomyRunParams, + skipIfActiveSource: boolean, +): Promise { + const rootDir = resolve(params.rootDir ?? getProjectRoot()) + const currentDir = resolve(params.currentDir ?? rootDir) + const record = buildAutonomyRunRecord(params, rootDir, currentDir) + + const { created, recoveredStaleRuns } = await persistAutonomyRunRecord( + record, + rootDir, + skipIfActiveSource, + ) + for (const recovered of recoveredStaleRuns) { + await syncFailedManagedFlowForRun(recovered, rootDir) + } + if (!created) { + return null + } + await queueManagedFlowStepRunForRecord(record, rootDir) + return record +} + +export async function createAutonomyRun( + params: CreateAutonomyRunParams, +): Promise { + const record = await createAutonomyRunCore(params, false) + if (!record) { + throw new Error('Autonomy run was unexpectedly skipped.') + } return record } +export async function createAutonomyRunIfNoActiveSource( + params: CreateAutonomyRunParams & { sourceId: string }, +): Promise { + return createAutonomyRunCore(params, true) +} + function buildManagedFlowStepPrompt( flow: AutonomyFlowRecord, stepIndex: number, @@ -336,6 +595,7 @@ async function createOrRecoverManagedFlowStepCommand(params: { workload: params.workload, autonomy: { runId: run.runId, + rootDir: run.rootDir, trigger: 'managed-flow-step', sourceId: run.sourceId, sourceLabel: run.sourceLabel, @@ -426,11 +686,16 @@ export async function markAutonomyRunRunning( ): Promise { const updated = await updateAutonomyRun( runId, - current => ({ - ...current, - status: 'running', - startedAt: nowMs ?? Date.now(), - }), + current => + current.status === 'queued' + ? { + ...current, + status: 'running', + startedAt: nowMs ?? 
Date.now(), + ownerProcessId: process.pid, + ownerSessionId: getSessionId(), + } + : null, rootDir, ) if (updated?.parentFlowId && updated.parentFlowSyncMode === 'managed') { @@ -451,12 +716,15 @@ export async function markAutonomyRunCompleted( ): Promise { const updated = await updateAutonomyRun( runId, - current => ({ - ...current, - status: 'completed', - endedAt: nowMs ?? Date.now(), - error: undefined, - }), + current => + current.status === 'queued' || current.status === 'running' + ? { + ...current, + status: 'completed', + endedAt: nowMs ?? Date.now(), + error: undefined, + } + : null, rootDir, ) if (updated?.parentFlowId && updated.parentFlowSyncMode === 'managed') { @@ -476,24 +744,17 @@ export async function markAutonomyRunFailed( rootDir?: string, nowMs?: number, ): Promise { + const endedAt = nowMs ?? Date.now() const updated = await updateAutonomyRun( runId, - current => ({ - ...current, - status: 'failed', - endedAt: nowMs ?? Date.now(), - error, - }), + current => + isActiveAutonomyRunStatus(current.status) + ? failAutonomyRunRecord(current, error, endedAt) + : null, rootDir, ) - if (updated?.parentFlowId && updated.parentFlowSyncMode === 'managed') { - await markManagedAutonomyFlowStepFailed({ - flowId: updated.parentFlowId, - runId: updated.runId, - error, - rootDir, - nowMs: updated.endedAt, - }) + if (updated) { + await syncFailedManagedFlowForRun(updated, rootDir ?? updated.rootDir) } return updated } @@ -505,12 +766,15 @@ export async function markAutonomyRunCancelled( ): Promise { const updated = await updateAutonomyRun( runId, - current => ({ - ...current, - status: 'cancelled', - endedAt: nowMs ?? Date.now(), - error: undefined, - }), + current => + current.status === 'queued' || current.status === 'running' + ? { + ...current, + status: 'cancelled', + endedAt: nowMs ?? 
Date.now(), + error: undefined, + } + : null, rootDir, ) if (updated?.parentFlowId && updated.parentFlowSyncMode === 'managed') { @@ -612,6 +876,7 @@ export async function createAutonomyQueuedPrompt(params: { currentDir?: string sourceId?: string sourceLabel?: string + ownerKey?: string workload?: string priority?: 'now' | 'next' | 'later' shouldCreate?: () => boolean @@ -634,39 +899,130 @@ export async function createAutonomyQueuedPrompt(params: { currentDir, sourceId: params.sourceId, sourceLabel: params.sourceLabel, + ownerKey: params.ownerKey, workload: params.workload, priority: params.priority, flow: params.flow, }) } +export async function createAutonomyQueuedPromptIfNoActiveSource(params: { + trigger: AutonomyTriggerKind + basePrompt: string + rootDir?: string + currentDir?: string + sourceId: string + sourceLabel?: string + ownerKey?: string + workload?: string + priority?: 'now' | 'next' | 'later' + shouldCreate?: () => boolean +}): Promise { + const rootDir = resolve(params.rootDir ?? getProjectRoot()) + const currentDir = resolve(params.currentDir ?? getCwd()) + // Cheap optimistic pre-check: skip the AGENTS.md / HEARTBEAT.md disk + // reads + prompt assembly when an active run for this source already + // blocks dedup. The lock-side check inside persistAutonomyRunRecord + // remains authoritative; this only fast-paths the common storm case. 
+ if ( + await hasActiveAutonomyRunForSource({ + trigger: params.trigger, + sourceId: params.sourceId, + rootDir, + ownerKey: params.ownerKey, + }) + ) { + return null + } + const prepared = await prepareAutonomyTurnPrompt({ + basePrompt: params.basePrompt, + trigger: params.trigger, + rootDir, + currentDir, + }) + if (params.shouldCreate && !params.shouldCreate()) { + return null + } + return commitAutonomyQueuedPromptIfNoActiveSource({ + prepared, + rootDir, + currentDir, + sourceId: params.sourceId, + sourceLabel: params.sourceLabel, + ownerKey: params.ownerKey, + workload: params.workload, + priority: params.priority, + }) +} + export async function commitAutonomyQueuedPrompt(params: { prepared: Awaited> rootDir?: string currentDir?: string sourceId?: string sourceLabel?: string + ownerKey?: string workload?: string priority?: 'now' | 'next' | 'later' flow?: AutonomyRunFlowRef }): Promise { + const command = await commitAutonomyQueuedPromptInternal(params, false) + if (!command) { + throw new Error('Autonomy queued prompt was unexpectedly skipped.') + } + return command +} + +async function commitAutonomyQueuedPromptIfNoActiveSource(params: { + prepared: Awaited> + rootDir?: string + currentDir?: string + sourceId: string + sourceLabel?: string + ownerKey?: string + workload?: string + priority?: 'now' | 'next' | 'later' +}): Promise { + return commitAutonomyQueuedPromptInternal(params, true) +} + +async function commitAutonomyQueuedPromptInternal( + params: { + prepared: Awaited> + rootDir?: string + currentDir?: string + sourceId?: string + sourceLabel?: string + ownerKey?: string + workload?: string + priority?: 'now' | 'next' | 'later' + flow?: AutonomyRunFlowRef + }, + skipWhenActiveSource: boolean, +): Promise { const rootDir = resolve( params.rootDir ?? params.prepared.rootDir ?? getProjectRoot(), ) const currentDir = resolve( params.currentDir ?? params.prepared.currentDir ?? 
getCwd(), ) - commitPreparedAutonomyTurn(params.prepared) const value = params.prepared.prompt - const run = await createAutonomyRun({ + const runParams: CreateAutonomyRunParams = { trigger: params.prepared.trigger, prompt: value, rootDir, currentDir, sourceId: params.sourceId, sourceLabel: params.sourceLabel, + ownerKey: params.ownerKey, flow: params.flow, - }) + } + const useDedup = skipWhenActiveSource && Boolean(params.sourceId) + const run = await createAutonomyRunCore(runParams, useDedup) + if (!run) { + return null + } + commitPreparedAutonomyTurn(params.prepared) const origin = { kind: 'autonomy', trigger: params.prepared.trigger, @@ -683,6 +1039,7 @@ export async function commitAutonomyQueuedPrompt(params: { workload: params.workload, autonomy: { runId: run.runId, + rootDir: run.rootDir, trigger: params.prepared.trigger, sourceId: params.sourceId, sourceLabel: params.sourceLabel, diff --git a/src/utils/handlePromptSubmit.ts b/src/utils/handlePromptSubmit.ts index 97b05758f1..e8c387167b 100644 --- a/src/utils/handlePromptSubmit.ts +++ b/src/utils/handlePromptSubmit.ts @@ -19,19 +19,20 @@ import { } from '../types/textInputTypes.js' import { createAbortController } from './abortController.js' import type { PastedContent } from './config.js' +import { getCwd } from './cwd.js' import { logForDebugging } from './debug.js' import type { EffortValue } from './effort.js' import type { FileHistoryState } from './fileHistory.js' import { fileHistoryEnabled, fileHistoryMakeSnapshot } from './fileHistory.js' import { gracefulShutdownSync } from './gracefulShutdown.js' +import { toError } from './errors.js' +import { logError } from './log.js' import { enqueue } from './messageQueueManager.js' import { resolveSkillModelOverride } from './model/model.js' import { - finalizeAutonomyRunCompleted, - finalizeAutonomyRunFailed, - markAutonomyRunFailed, - markAutonomyRunRunning, -} from './autonomyRuns.js' + claimConsumableQueuedAutonomyCommands, + 
finalizeAutonomyCommandsForTurn, +} from './autonomyQueueLifecycle.js' import type { ProcessUserInputContext } from './processUserInput/processUserInput.js' import { processUserInput } from './processUserInput/processUserInput.js' import type { QueryGuard } from './QueryGuard.js' @@ -75,7 +76,7 @@ type BaseExecutionParams = { onBeforeQuery?: (input: string, newMessages: Message[]) => Promise, input?: string, effort?: EffortValue, - ) => Promise + ) => Promise setAppState: (updater: (prev: AppState) => AppState) => void onBeforeQuery?: (input: string, newMessages: Message[]) => Promise canUseTool?: CanUseToolFn @@ -459,7 +460,18 @@ async function executeUserInput(params: ExecuteUserInputParams): Promise { // Iterate all commands uniformly. First command gets attachments + // ideSelection + pastedContents, rest skip attachments to avoid // duplicating turn-level context (IDE selection, todos, diffs). - const commands = queuedCommands ?? [] + let commands = queuedCommands ?? [] + const queuedAutonomyClaim = + await claimConsumableQueuedAutonomyCommands(commands) + commands = queuedAutonomyClaim.attachmentCommands + const claimedAutonomyCommands = queuedAutonomyClaim.claimedCommands + if (commands.length === 0) { + // Clear the abort controller published a few lines above so this turn's + // stale controller does not leak into the next turn when every claimed + // autonomy command was skipped as non-consumable. + setAbortController(null) + return + } // Compute the workload tag for this turn. queueProcessor can batch a // cron prompt with a same-tick human prompt; only tag when EVERY @@ -471,7 +483,7 @@ async function executeUserInput(params: ExecuteUserInputParams): Promise { commands.every(c => c.workload === firstWorkload) ? firstWorkload : undefined - let autonomyRunIds: string[] | undefined + const deferredAutonomyRunIds = new Set() // Wrap the entire turn (processUserInput loop + onQuery) in an // AsyncLocalStorage context. 
This is the ONLY way to correctly @@ -481,15 +493,13 @@ async function executeUserInput(params: ExecuteUserInputParams): Promise { // context — isolated from the parent's continuation. A process-global // mutable slot would be clobbered at the detached closure's first // await by this function's synchronous return path. See state.ts. + let turnError: unknown try { await runWithWorkload(turnWorkload, async () => { for (let i = 0; i < commands.length; i++) { const cmd = commands[i]! const isFirst = i === 0 - if (cmd.autonomy?.runId) { - ;(autonomyRunIds ??= []).push(cmd.autonomy.runId) - await markAutonomyRunRunning(cmd.autonomy.runId) - } + const runId = cmd.autonomy?.runId const result = await processUserInput({ input: cmd.value, preExpansionInput: cmd.preExpansionValue, @@ -510,7 +520,11 @@ async function executeUserInput(params: ExecuteUserInputParams): Promise { bridgeOrigin: cmd.bridgeOrigin, isMeta: cmd.isMeta, skipAttachments: !isFirst, + autonomy: cmd.autonomy, }) + if (runId && result.deferAutonomyCompletion) { + deferredAutonomyRunIds.add(runId) + } // Stamp origin here rather than threading another arg through // processUserInput → processUserInputBase → processTextPrompt → createUserMessage. // Derive origin from mode for task-notifications — mirrors the origin @@ -611,28 +625,52 @@ async function executeUserInput(params: ExecuteUserInputParams): Promise { } } }) // end runWithWorkload — ALS context naturally scoped, no finally needed - if (autonomyRunIds?.length) { - for (const runId of autonomyRunIds) { - const nextCommands = await finalizeAutonomyRunCompleted({ - runId, + } catch (error) { + turnError = error + } + + // Finalize claimed autonomy commands as `completed` only if the turn + // body itself succeeded. Run the finalize call in its own try/catch so a + // failure there does not double-finalize the same commands as `failed` + // (which previously cancelled follow-up queue state after a successful + // turn). 
+ if (claimedAutonomyCommands.length) { + const finalizableCommands = claimedAutonomyCommands.filter(command => { + const runId = command.autonomy?.runId + return !runId || !deferredAutonomyRunIds.has(runId) + }) + if (turnError) { + try { + await finalizeAutonomyCommandsForTurn({ + commands: finalizableCommands, + outcome: { type: 'failed', error: turnError }, + currentDir: getCwd(), + priority: 'later', + workload: turnWorkload, + }) + } catch (finalizeError) { + logError(toError(finalizeError)) + } + } else { + try { + const nextCommands = await finalizeAutonomyCommandsForTurn({ + commands: finalizableCommands, + outcome: { type: 'completed' }, + currentDir: getCwd(), priority: 'later', workload: turnWorkload, }) for (const nextCommand of nextCommands) { enqueue(nextCommand) } + } catch (finalizeError) { + logError(toError(finalizeError)) } } - } catch (error) { - if (autonomyRunIds?.length) { - for (const runId of autonomyRunIds) { - await finalizeAutonomyRunFailed({ - runId, - error: String(error), - }) - } - } - throw error + } + + if (turnError) { + throw turnError } } finally { // Safety net: release the guard reservation if processUserInput threw diff --git a/src/utils/model/__tests__/providers.test.ts b/src/utils/model/__tests__/providers.test.ts index 0ed816f9e1..6790a3e6f9 100644 --- a/src/utils/model/__tests__/providers.test.ts +++ b/src/utils/model/__tests__/providers.test.ts @@ -1,173 +1,162 @@ -import { describe, expect, test, beforeEach, afterEach } from "bun:test"; -import { mock } from "bun:test"; +import { describe, expect, test, beforeEach, afterEach } from 'bun:test' -let mockedModelType: "gemini" | undefined; +const { getAPIProvider, isFirstPartyAnthropicBaseUrl } = await import( + '../providers' +) -mock.module("../../settings/settings.js", () => ({ - getInitialSettings: () => - mockedModelType ? 
{ modelType: mockedModelType } : {}, -})); - -const { getAPIProvider, isFirstPartyAnthropicBaseUrl } = - await import("../providers"); - -describe("getAPIProvider", () => { +describe('getAPIProvider', () => { const envKeys = [ - "CLAUDE_CODE_USE_GEMINI", - "CLAUDE_CODE_USE_BEDROCK", - "CLAUDE_CODE_USE_VERTEX", - "CLAUDE_CODE_USE_FOUNDRY", - "CLAUDE_CODE_USE_OPENAI", - ] as const; - const savedEnv: Record = {}; - + 'CLAUDE_CODE_USE_GEMINI', + 'CLAUDE_CODE_USE_BEDROCK', + 'CLAUDE_CODE_USE_VERTEX', + 'CLAUDE_CODE_USE_FOUNDRY', + 'CLAUDE_CODE_USE_OPENAI', + 'CLAUDE_CODE_USE_GROK', + ] as const + const savedEnv: Record = {} beforeEach(() => { // Save and clear environment variables - mockedModelType = undefined; for (const key of envKeys) { - savedEnv[key] = process.env[key]; - delete process.env[key]; + savedEnv[key] = process.env[key] + delete process.env[key] } - }); + }) afterEach(() => { // Restore environment variables - mockedModelType = undefined; for (const key of envKeys) { if (savedEnv[key] !== undefined) { - process.env[key] = savedEnv[key]; + process.env[key] = savedEnv[key] } else { - delete process.env[key]; + delete process.env[key] } } - }); + }) test('returns "firstParty" by default', () => { - expect(getAPIProvider()).toBe("firstParty"); - }); + expect(getAPIProvider({})).toBe('firstParty') + }) test('returns "gemini" when modelType is gemini', () => { - mockedModelType = "gemini"; - expect(getAPIProvider()).toBe("gemini"); - }); + expect(getAPIProvider({ modelType: 'gemini' })).toBe('gemini') + }) - test("modelType takes precedence over environment variables", () => { - mockedModelType = "gemini"; - process.env.CLAUDE_CODE_USE_BEDROCK = "1"; - expect(getAPIProvider()).toBe("gemini"); - }); + test('modelType takes precedence over environment variables', () => { + process.env.CLAUDE_CODE_USE_BEDROCK = '1' + expect(getAPIProvider({ modelType: 'gemini' })).toBe('gemini') + }) test('returns "gemini" when CLAUDE_CODE_USE_GEMINI is set', () => { - 
process.env.CLAUDE_CODE_USE_GEMINI = "1"; - expect(getAPIProvider()).toBe("gemini"); - }); + process.env.CLAUDE_CODE_USE_GEMINI = '1' + expect(getAPIProvider({})).toBe('gemini') + }) test('returns "bedrock" when CLAUDE_CODE_USE_BEDROCK is set', () => { - process.env.CLAUDE_CODE_USE_BEDROCK = "1"; - expect(getAPIProvider()).toBe("bedrock"); - }); + process.env.CLAUDE_CODE_USE_BEDROCK = '1' + expect(getAPIProvider({})).toBe('bedrock') + }) test('returns "vertex" when CLAUDE_CODE_USE_VERTEX is set', () => { - process.env.CLAUDE_CODE_USE_VERTEX = "1"; - expect(getAPIProvider()).toBe("vertex"); - }); + process.env.CLAUDE_CODE_USE_VERTEX = '1' + expect(getAPIProvider({})).toBe('vertex') + }) test('returns "foundry" when CLAUDE_CODE_USE_FOUNDRY is set', () => { - process.env.CLAUDE_CODE_USE_FOUNDRY = "1"; - expect(getAPIProvider()).toBe("foundry"); - }); - - test("bedrock takes precedence over gemini", () => { - process.env.CLAUDE_CODE_USE_BEDROCK = "1"; - process.env.CLAUDE_CODE_USE_GEMINI = "1"; - expect(getAPIProvider()).toBe("bedrock"); - }); - - test("bedrock takes precedence over vertex", () => { - process.env.CLAUDE_CODE_USE_BEDROCK = "1"; - process.env.CLAUDE_CODE_USE_VERTEX = "1"; - expect(getAPIProvider()).toBe("bedrock"); - }); - - test("bedrock wins when all three env vars are set", () => { - process.env.CLAUDE_CODE_USE_BEDROCK = "1"; - process.env.CLAUDE_CODE_USE_VERTEX = "1"; - process.env.CLAUDE_CODE_USE_FOUNDRY = "1"; - expect(getAPIProvider()).toBe("bedrock"); - }); + process.env.CLAUDE_CODE_USE_FOUNDRY = '1' + expect(getAPIProvider({})).toBe('foundry') + }) + + test('bedrock takes precedence over gemini', () => { + process.env.CLAUDE_CODE_USE_BEDROCK = '1' + process.env.CLAUDE_CODE_USE_GEMINI = '1' + expect(getAPIProvider({})).toBe('bedrock') + }) + + test('bedrock takes precedence over vertex', () => { + process.env.CLAUDE_CODE_USE_BEDROCK = '1' + process.env.CLAUDE_CODE_USE_VERTEX = '1' + expect(getAPIProvider({})).toBe('bedrock') + }) + + 
test('bedrock wins when all three env vars are set', () => { + process.env.CLAUDE_CODE_USE_BEDROCK = '1' + process.env.CLAUDE_CODE_USE_VERTEX = '1' + process.env.CLAUDE_CODE_USE_FOUNDRY = '1' + expect(getAPIProvider({})).toBe('bedrock') + }) test('"true" is truthy', () => { - process.env.CLAUDE_CODE_USE_BEDROCK = "true"; - expect(getAPIProvider()).toBe("bedrock"); - }); + process.env.CLAUDE_CODE_USE_BEDROCK = 'true' + expect(getAPIProvider({})).toBe('bedrock') + }) test('"0" is not truthy', () => { - process.env.CLAUDE_CODE_USE_BEDROCK = "0"; - expect(getAPIProvider()).toBe("firstParty"); - }); + process.env.CLAUDE_CODE_USE_BEDROCK = '0' + expect(getAPIProvider({})).toBe('firstParty') + }) test('empty string is not truthy', () => { - process.env.CLAUDE_CODE_USE_BEDROCK = ""; - expect(getAPIProvider()).toBe("firstParty"); - }); -}); + process.env.CLAUDE_CODE_USE_BEDROCK = '' + expect(getAPIProvider({})).toBe('firstParty') + }) +}) -describe("isFirstPartyAnthropicBaseUrl", () => { - const originalBaseUrl = process.env.ANTHROPIC_BASE_URL; - const originalUserType = process.env.USER_TYPE; +describe('isFirstPartyAnthropicBaseUrl', () => { + const originalBaseUrl = process.env.ANTHROPIC_BASE_URL + const originalUserType = process.env.USER_TYPE afterEach(() => { if (originalBaseUrl !== undefined) { - process.env.ANTHROPIC_BASE_URL = originalBaseUrl; + process.env.ANTHROPIC_BASE_URL = originalBaseUrl } else { - delete process.env.ANTHROPIC_BASE_URL; + delete process.env.ANTHROPIC_BASE_URL } if (originalUserType !== undefined) { - process.env.USER_TYPE = originalUserType; + process.env.USER_TYPE = originalUserType } else { - delete process.env.USER_TYPE; + delete process.env.USER_TYPE } - }); - - test("returns true when ANTHROPIC_BASE_URL is not set", () => { - delete process.env.ANTHROPIC_BASE_URL; - expect(isFirstPartyAnthropicBaseUrl()).toBe(true); - }); - - test("returns true for api.anthropic.com", () => { - process.env.ANTHROPIC_BASE_URL = "https://api.anthropic.com"; 
- expect(isFirstPartyAnthropicBaseUrl()).toBe(true); - }); - - test("returns false for custom URL", () => { - process.env.ANTHROPIC_BASE_URL = "https://my-proxy.com"; - expect(isFirstPartyAnthropicBaseUrl()).toBe(false); - }); - - test("returns false for invalid URL", () => { - process.env.ANTHROPIC_BASE_URL = "not-a-url"; - expect(isFirstPartyAnthropicBaseUrl()).toBe(false); - }); - - test("returns true for staging URL when USER_TYPE is ant", () => { - process.env.ANTHROPIC_BASE_URL = "https://api-staging.anthropic.com"; - process.env.USER_TYPE = "ant"; - expect(isFirstPartyAnthropicBaseUrl()).toBe(true); - }); - - test("returns true for URL with path", () => { - process.env.ANTHROPIC_BASE_URL = "https://api.anthropic.com/v1"; - expect(isFirstPartyAnthropicBaseUrl()).toBe(true); - }); - - test("returns true for trailing slash", () => { - process.env.ANTHROPIC_BASE_URL = "https://api.anthropic.com/"; - expect(isFirstPartyAnthropicBaseUrl()).toBe(true); - }); - - test("returns false for subdomain attack", () => { - process.env.ANTHROPIC_BASE_URL = "https://evil-api.anthropic.com"; - expect(isFirstPartyAnthropicBaseUrl()).toBe(false); - }); -}); + }) + + test('returns true when ANTHROPIC_BASE_URL is not set', () => { + delete process.env.ANTHROPIC_BASE_URL + expect(isFirstPartyAnthropicBaseUrl()).toBe(true) + }) + + test('returns true for api.anthropic.com', () => { + process.env.ANTHROPIC_BASE_URL = 'https://api.anthropic.com' + expect(isFirstPartyAnthropicBaseUrl()).toBe(true) + }) + + test('returns false for custom URL', () => { + process.env.ANTHROPIC_BASE_URL = 'https://my-proxy.com' + expect(isFirstPartyAnthropicBaseUrl()).toBe(false) + }) + + test('returns false for invalid URL', () => { + process.env.ANTHROPIC_BASE_URL = 'not-a-url' + expect(isFirstPartyAnthropicBaseUrl()).toBe(false) + }) + + test('returns true for staging URL when USER_TYPE is ant', () => { + process.env.ANTHROPIC_BASE_URL = 'https://api-staging.anthropic.com' + process.env.USER_TYPE = 
'ant' + expect(isFirstPartyAnthropicBaseUrl()).toBe(true) + }) + + test('returns true for URL with path', () => { + process.env.ANTHROPIC_BASE_URL = 'https://api.anthropic.com/v1' + expect(isFirstPartyAnthropicBaseUrl()).toBe(true) + }) + + test('returns true for trailing slash', () => { + process.env.ANTHROPIC_BASE_URL = 'https://api.anthropic.com/' + expect(isFirstPartyAnthropicBaseUrl()).toBe(true) + }) + + test('returns false for subdomain attack', () => { + process.env.ANTHROPIC_BASE_URL = 'https://evil-api.anthropic.com' + expect(isFirstPartyAnthropicBaseUrl()).toBe(false) + }) +}) diff --git a/src/utils/model/providers.ts b/src/utils/model/providers.ts index 79572d42e8..d4784da844 100644 --- a/src/utils/model/providers.ts +++ b/src/utils/model/providers.ts @@ -1,5 +1,6 @@ import type { AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS } from '../../services/analytics/index.js' import { getInitialSettings } from '../settings/settings.js' +import type { SettingsJson } from '../settings/types.js' import { isEnvTruthy } from '../envUtils.js' export type APIProvider = @@ -11,8 +12,10 @@ export type APIProvider = | 'gemini' | 'grok' -export function getAPIProvider(): APIProvider { - const modelType = getInitialSettings().modelType +export function getAPIProvider( + settings: Pick = getInitialSettings(), +): APIProvider { + const modelType = settings.modelType if (modelType === 'openai') return 'openai' if (modelType === 'gemini') return 'gemini' if (modelType === 'grok') return 'grok' diff --git a/src/utils/processUserInput/__tests__/processSlashCommand.test.ts b/src/utils/processUserInput/__tests__/processSlashCommand.test.ts new file mode 100644 index 0000000000..7ba0f3c2b3 --- /dev/null +++ b/src/utils/processUserInput/__tests__/processSlashCommand.test.ts @@ -0,0 +1,375 @@ +import { afterEach, beforeEach, describe, expect, mock, test } from 'bun:test' +import type { QueuedCommand } from '../../../types/textInputTypes' +import { + resetStateForTests, + 
setCwdState, + setOriginalCwd, + setProjectRoot, +} from '../../../bootstrap/state' +import { + createAutonomyQueuedPrompt, + getAutonomyRunById, + listAutonomyRuns, + markAutonomyRunRunning, +} from '../../autonomyRuns' +import { resetAutonomyAuthorityForTests } from '../../autonomyAuthority' +import { createScheduledTaskQueuedCommand } from '../../../hooks/useScheduledTasks' +import { + cleanupTempDir, + createTempDir, +} from '../../../../tests/mocks/file-system' + +let runAgentBlocker: Promise | null = null +let releaseRunAgentBlocker: (() => void) | null = null +let runAgentStartCount = 0 +let originalNodeEnv: string | undefined +let originalAnthropicApiKey: string | undefined +const commandQueue: QueuedCommand[] = [] + +function enqueue(command: QueuedCommand): void { + commandQueue.push({ ...command, priority: command.priority ?? 'next' }) +} + +function enqueuePendingNotification(command: QueuedCommand): void { + commandQueue.push({ ...command, priority: command.priority ?? 'later' }) +} + +function getCommandQueue(): QueuedCommand[] { + return [...commandQueue] +} + +function hasCommandsInQueue(): boolean { + return commandQueue.length > 0 +} + +function resetCommandQueue(): void { + commandQueue.length = 0 +} + +function createMessageQueueManagerMock() { + return { + enqueue, + enqueuePendingNotification, + getCommandQueue, + hasCommandsInQueue, + resetCommandQueue, + } +} + +function holdRunAgent(): void { + runAgentBlocker = new Promise(resolve => { + releaseRunAgentBlocker = resolve + }) +} + +function releaseRunAgent(): void { + releaseRunAgentBlocker?.() + runAgentBlocker = null + releaseRunAgentBlocker = null +} + +mock.module('bun:bundle', () => ({ + feature: (name: string) => name === 'KAIROS', +})) + +mock.module( + '@claude-code-best/builtin-tools/tools/AgentTool/runAgent.js', + () => ({ + runAgent: async function* () { + runAgentStartCount += 1 + if (runAgentBlocker) { + await runAgentBlocker + } + yield { + type: 'assistant', + uuid: 
'assistant-1', + timestamp: new Date().toISOString(), + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + model: 'test-model', + content: [{ type: 'text', text: 'forked command done' }], + stop_reason: 'end_turn', + stop_sequence: null, + usage: { + input_tokens: 0, + output_tokens: 0, + }, + }, + } + }, + }), +) + +mock.module('@claude-code-best/builtin-tools/tools/AgentTool/UI.js', () => ({ + AgentPromptDisplay: () => null, + AgentResponseDisplay: () => null, + extractLastToolInfo: () => null, + renderGroupedAgentToolUse: () => null, + renderToolResultMessage: () => null, + renderToolUseErrorMessage: () => null, + renderToolUseMessage: () => null, + renderToolUseProgressMessage: () => null, + renderToolUseRejectedMessage: () => null, + renderToolUseTag: () => null, + userFacingName: () => 'Agent', + userFacingNameBackgroundColor: () => 'gray', +})) + +mock.module('../../messageQueueManager', createMessageQueueManagerMock) +mock.module('../../messageQueueManager.js', createMessageQueueManagerMock) + +const { processSlashCommand } = await import('../processSlashCommand') + +let tempDir = '' + +function createScheduledTaskQueuedCommandForTest(task: { + id: string + prompt: string +}) { + return createScheduledTaskQueuedCommand(task, { + rootDir: tempDir, + currentDir: tempDir, + }) +} + +async function waitForRunStatus( + runId: string, + status: 'queued' | 'running' | 'completed' | 'failed' | 'cancelled', +): Promise { + for (let i = 0; i < 200; i++) { + const run = await getAutonomyRunById(runId, tempDir) + if (run?.status === status) { + return + } + await new Promise(resolve => setTimeout(resolve, 10)) + } + const run = await getAutonomyRunById(runId, tempDir) + throw new Error(`Expected ${runId} to be ${status}, got ${run?.status}`) +} + +async function waitForRunAgentStarts(expected: number): Promise { + for (let i = 0; i < 200; i++) { + if (runAgentStartCount >= expected) { + return + } + await new Promise(resolve => setTimeout(resolve, 10)) 
+ } + throw new Error( + `Expected runAgent to start ${expected} time(s), got ${runAgentStartCount}`, + ) +} + +async function waitForCommandQueueLength(expected: number): Promise { + for (let i = 0; i < 200; i++) { + if (getCommandQueue().length === expected) { + return + } + await new Promise(resolve => setTimeout(resolve, 10)) + } + throw new Error( + `Expected command queue length ${expected}, got ${getCommandQueue().length}`, + ) +} + +beforeEach(async () => { + tempDir = await createTempDir('process-slash-command-') + originalNodeEnv = process.env.NODE_ENV + originalAnthropicApiKey = process.env.ANTHROPIC_API_KEY + process.env.NODE_ENV = 'test' + process.env.ANTHROPIC_API_KEY = 'test-key' + runAgentBlocker = null + releaseRunAgentBlocker = null + runAgentStartCount = 0 + resetStateForTests() + resetAutonomyAuthorityForTests() + resetCommandQueue() + setOriginalCwd(tempDir) + setProjectRoot(tempDir) + setCwdState(tempDir) +}) + +afterEach(async () => { + releaseRunAgent() + if (originalNodeEnv === undefined) { + delete process.env.NODE_ENV + } else { + process.env.NODE_ENV = originalNodeEnv + } + if (originalAnthropicApiKey === undefined) { + delete process.env.ANTHROPIC_API_KEY + } else { + process.env.ANTHROPIC_API_KEY = originalAnthropicApiKey + } + resetStateForTests() + resetAutonomyAuthorityForTests() + resetCommandQueue() + if (tempDir) { + await cleanupTempDir(tempDir) + } + mock.restore() +}) + +describe('processSlashCommand', () => { + const forkedCommand = { + type: 'prompt', + name: 'forked', + description: 'test forked command', + progressMessage: 'forking', + contentLength: 0, + source: 'builtin', + context: 'fork', + getPromptForCommand: async () => [ + { type: 'text', text: 'review from fork' }, + ], + } as const + + function createContext() { + return { + getAppState: () => ({ + kairosEnabled: true, + mcp: { clients: [] }, + toolPermissionContext: { + mode: 'default', + alwaysAllowRules: {}, + }, + }), + options: { + commands: [forkedCommand], 
+ allowBackgroundForkedSlashCommands: true, + tools: [], + refreshTools: () => [], + agentDefinitions: { + activeAgents: [{ agentType: 'general-purpose' }], + }, + }, + setResponseLength: mock((_updater: (length: number) => number) => {}), + } as any + } + + test('defers autonomy completion until a KAIROS background forked command completes', async () => { + const queued = await createAutonomyQueuedPrompt({ + basePrompt: '/forked review', + trigger: 'scheduled-task', + rootDir: tempDir, + currentDir: tempDir, + sourceId: 'cron-1', + }) + expect(queued).not.toBeNull() + const runId = queued!.autonomy!.runId + await markAutonomyRunRunning(runId, tempDir, 100) + + const result = await processSlashCommand( + '/forked review', + [], + [], + [], + createContext(), + mock(() => {}), + undefined, + false, + async () => ({ behavior: 'allow', updatedInput: {} }) as any, + queued!.autonomy, + ) + + expect(result).toMatchObject({ + messages: [], + shouldQuery: false, + deferAutonomyCompletion: true, + }) + + await waitForRunStatus(runId, 'completed') + await waitForCommandQueueLength(1) + expect(getCommandQueue()).toEqual([ + expect.objectContaining({ + mode: 'prompt', + isMeta: true, + skipSlashCommands: true, + value: expect.stringContaining( + '', + ), + }), + ]) + }) + + test('keeps repeated /loop scheduled fires bounded while a background fork is running', async () => { + const task = { + id: 'cron-loop', + prompt: '/forked review', + } + const first = await createScheduledTaskQueuedCommandForTest(task) + expect(first?.autonomy?.runId).toBeDefined() + const runId = first!.autonomy!.runId + await markAutonomyRunRunning(runId, tempDir, 100) + + holdRunAgent() + const result = await processSlashCommand( + '/forked review', + [], + [], + [], + createContext(), + mock(() => {}), + undefined, + false, + async () => ({ behavior: 'allow', updatedInput: {} }) as any, + first!.autonomy, + ) + + expect(result.deferAutonomyCompletion).toBe(true) + await waitForRunAgentStarts(1) + + 
const repeatedFires = await Promise.all( + Array.from({ length: 200 }, () => + createScheduledTaskQueuedCommandForTest(task), + ), + ) + expect(repeatedFires.every(command => command === null)).toBe(true) + expect( + (await listAutonomyRuns(tempDir)).filter( + run => run.sourceId === 'cron-loop', + ), + ).toHaveLength(1) + expect(getCommandQueue()).toHaveLength(0) + + releaseRunAgent() + await waitForRunStatus(runId, 'completed') + await waitForCommandQueueLength(1) + expect(getCommandQueue()).toHaveLength(1) + + const next = await createScheduledTaskQueuedCommandForTest(task) + expect(next?.autonomy?.runId).toBeDefined() + expect( + (await listAutonomyRuns(tempDir)).filter( + run => run.sourceId === 'cron-loop', + ), + ).toHaveLength(2) + }) + + test('rejects the background fork test override outside test runtime', async () => { + process.env.NODE_ENV = 'production' + + const result = await processSlashCommand( + '/forked review', + [], + [], + [], + createContext(), + mock(() => {}), + undefined, + false, + async () => ({ behavior: 'allow', updatedInput: {} }) as any, + ) + + expect(result.shouldQuery).toBe(false) + expect( + result.messages.some(message => + JSON.stringify(message).includes( + 'allowBackgroundForkedSlashCommands is test-only', + ), + ), + ).toBe(true) + expect(runAgentStartCount).toBe(0) + }) +}) diff --git a/src/utils/processUserInput/processSlashCommand.tsx b/src/utils/processUserInput/processSlashCommand.tsx index 6ee4bfe93b..da6763f7e6 100644 --- a/src/utils/processUserInput/processSlashCommand.tsx +++ b/src/utils/processUserInput/processSlashCommand.tsx @@ -1,10 +1,7 @@ -import { feature } from 'bun:bundle' -import type { - ContentBlockParam, - TextBlockParam, -} from '@anthropic-ai/sdk/resources' -import { randomUUID } from 'crypto' -import { setPromptId } from 'src/bootstrap/state.js' +import { feature } from 'bun:bundle'; +import type { ContentBlockParam, TextBlockParam } from '@anthropic-ai/sdk/resources'; +import { randomUUID } from 
'crypto'; +import { setPromptId } from 'src/bootstrap/state.js'; import { builtInCommandNames, type Command, @@ -14,9 +11,9 @@ import { getCommandName, hasCommand, type PromptCommand, -} from 'src/commands.js' -import { NO_CONTENT_MESSAGE } from 'src/constants/messages.js' -import type { SetToolJSXFn, ToolUseContext } from 'src/Tool.js' +} from 'src/commands.js'; +import { NO_CONTENT_MESSAGE } from 'src/constants/messages.js'; +import type { SetToolJSXFn, ToolUseContext } from 'src/Tool.js'; import type { AssistantMessage, AttachmentMessage, @@ -24,42 +21,37 @@ import type { NormalizedUserMessage, ProgressMessage, UserMessage, -} from 'src/types/message.js' -import { addInvokedSkill, getSessionId } from '../../bootstrap/state.js' -import { COMMAND_MESSAGE_TAG, COMMAND_NAME_TAG } from '../../constants/xml.js' -import type { CanUseToolFn } from '../../hooks/useCanUseTool.js' +} from 'src/types/message.js'; +import type { QueuedCommand } from 'src/types/textInputTypes.js'; +import { addInvokedSkill, getSessionId } from '../../bootstrap/state.js'; +import { COMMAND_MESSAGE_TAG, COMMAND_NAME_TAG } from '../../constants/xml.js'; +import type { CanUseToolFn } from '../../hooks/useCanUseTool.js'; import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, type AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED, logEvent, -} from '../../services/analytics/index.js' -import { getDumpPromptsPath } from '../../services/api/dumpPrompts.js' -import { buildPostCompactMessages } from '../../services/compact/compact.js' -import { resetMicrocompactState } from '../../services/compact/microCompact.js' -import type { Progress as AgentProgress } from '@claude-code-best/builtin-tools/tools/AgentTool/AgentTool.js' -import { runAgent } from '@claude-code-best/builtin-tools/tools/AgentTool/runAgent.js' -import { renderToolUseProgressMessage } from '@claude-code-best/builtin-tools/tools/AgentTool/UI.js' -import type { CommandResultDisplay } from '../../types/command.js' -import { 
createAbortController } from '../abortController.js' -import { getAgentContext } from '../agentContext.js' -import { - createAttachmentMessage, - getAttachmentMessages, -} from '../attachments.js' -import { logForDebugging } from '../debug.js' -import { isEnvTruthy } from '../envUtils.js' -import { AbortError, MalformedCommandError } from '../errors.js' -import { getDisplayPath } from '../file.js' -import { - extractResultText, - prepareForkedCommandContext, -} from '../forkedAgent.js' -import { getFsImplementation } from '../fsOperations.js' -import { isFullscreenEnvEnabled } from '../fullscreen.js' -import { toArray } from '../generators.js' -import { registerSkillHooks } from '../hooks/registerSkillHooks.js' -import { logError } from '../log.js' -import { enqueuePendingNotification } from '../messageQueueManager.js' +} from '../../services/analytics/index.js'; +import { getDumpPromptsPath } from '../../services/api/dumpPrompts.js'; +import { buildPostCompactMessages } from '../../services/compact/compact.js'; +import { resetMicrocompactState } from '../../services/compact/microCompact.js'; +import type { Progress as AgentProgress } from '@claude-code-best/builtin-tools/tools/AgentTool/AgentTool.js'; +import { runAgent } from '@claude-code-best/builtin-tools/tools/AgentTool/runAgent.js'; +import { renderToolUseProgressMessage } from '@claude-code-best/builtin-tools/tools/AgentTool/UI.js'; +import type { CommandResultDisplay } from '../../types/command.js'; +import { createAbortController } from '../abortController.js'; +import { getAgentContext } from '../agentContext.js'; +import { createAttachmentMessage, getAttachmentMessages } from '../attachments.js'; +import { logForDebugging } from '../debug.js'; +import { isEnvTruthy } from '../envUtils.js'; +import { AbortError, MalformedCommandError } from '../errors.js'; +import { getDisplayPath } from '../file.js'; +import { extractResultText, prepareForkedCommandContext } from '../forkedAgent.js'; +import { 
getFsImplementation } from '../fsOperations.js'; +import { isFullscreenEnvEnabled } from '../fullscreen.js'; +import { toArray } from '../generators.js'; +import { registerSkillHooks } from '../hooks/registerSkillHooks.js'; +import { logError } from '../log.js'; +import { enqueue, enqueuePendingNotification } from '../messageQueueManager.js'; import { createCommandInputMessage, createSyntheticUserCaveatMessage, @@ -71,40 +63,44 @@ import { isSystemLocalCommandMessage, normalizeMessages, prepareUserContent, -} from '../messages.js' -import type { ModelAlias } from '../model/aliases.js' -import { parseToolListFromCLI } from '../permissions/permissionSetup.js' -import { hasPermissionsToUseTool } from '../permissions/permissions.js' -import { - isOfficialMarketplaceName, - parsePluginIdentifier, -} from '../plugins/pluginIdentifier.js' -import { - isRestrictedToPluginOnly, - isSourceAdminTrusted, -} from '../settings/pluginOnlyPolicy.js' -import { parseSlashCommand } from '../slashCommandParsing.js' -import { sleep } from '../sleep.js' -import { recordSkillUsage } from '../suggestions/skillUsageTracking.js' -import { logOTelEvent, redactIfDisabled } from '../telemetry/events.js' -import { buildPluginCommandTelemetryFields } from '../telemetry/pluginTelemetry.js' -import { getAssistantMessageContentLength } from '../tokens.js' -import { createAgentId } from '../uuid.js' -import { getWorkload } from '../workloadContext.js' -import type { - ProcessUserInputBaseResult, - ProcessUserInputContext, -} from './processUserInput.js' +} from '../messages.js'; +import type { ModelAlias } from '../model/aliases.js'; +import { parseToolListFromCLI } from '../permissions/permissionSetup.js'; +import { hasPermissionsToUseTool } from '../permissions/permissions.js'; +import { isOfficialMarketplaceName, parsePluginIdentifier } from '../plugins/pluginIdentifier.js'; +import { isRestrictedToPluginOnly, isSourceAdminTrusted } from '../settings/pluginOnlyPolicy.js'; +import { 
parseSlashCommand } from '../slashCommandParsing.js'; +import { sleep } from '../sleep.js'; +import { recordSkillUsage } from '../suggestions/skillUsageTracking.js'; +import { logOTelEvent, redactIfDisabled } from '../telemetry/events.js'; +import { buildPluginCommandTelemetryFields } from '../telemetry/pluginTelemetry.js'; +import { getAssistantMessageContentLength } from '../tokens.js'; +import { createAgentId } from '../uuid.js'; +import { finalizeAutonomyRunCompleted, finalizeAutonomyRunFailed } from '../autonomyRuns.js'; +import { getWorkload } from '../workloadContext.js'; +import type { ProcessUserInputBaseResult, ProcessUserInputContext } from './processUserInput.js'; type SlashCommandResult = ProcessUserInputBaseResult & { - command: Command -} + command: Command; +}; // Poll interval and deadline for MCP settle before launching a background // forked subagent. MCP servers typically connect within 1-3s of startup; // 10s headroom covers slow SSE handshakes. -const MCP_SETTLE_POLL_MS = 200 -const MCP_SETTLE_TIMEOUT_MS = 10_000 +const MCP_SETTLE_POLL_MS = 200; +const MCP_SETTLE_TIMEOUT_MS = 10_000; + +function isTestRuntime(): boolean { + return process.env.NODE_ENV === 'test'; +} + +function assertBackgroundForkedSlashCommandTestOverrideAllowed(): void { + if (!isTestRuntime()) { + throw new Error( + 'ToolUseContext.options.allowBackgroundForkedSlashCommands is test-only and cannot be enabled outside NODE_ENV=test.', + ); + } +} /** * Executes a slash command with context: fork in a sub-agent. @@ -116,40 +112,35 @@ async function executeForkedSlashCommand( precedingInputBlocks: ContentBlockParam[], setToolJSX: SetToolJSXFn, canUseTool: CanUseToolFn, + autonomy?: QueuedCommand['autonomy'], ): Promise { - const agentId = createAgentId() + const agentId = createAgentId(); const pluginMarketplace = command.pluginInfo ? 
parsePluginIdentifier(command.pluginInfo.repository).marketplace - : undefined + : undefined; logEvent('tengu_slash_command_forked', { - command_name: - command.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, - invocation_trigger: - 'user-slash' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + command_name: command.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + invocation_trigger: 'user-slash' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, ...(command.pluginInfo && { - _PROTO_plugin_name: command.pluginInfo.pluginManifest - .name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED, + _PROTO_plugin_name: command.pluginInfo.pluginManifest.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED, ...(pluginMarketplace && { - _PROTO_marketplace_name: - pluginMarketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED, + _PROTO_marketplace_name: pluginMarketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED, }), ...buildPluginCommandTelemetryFields(command.pluginInfo), }), - }) + }); - const { skillContent, modifiedGetAppState, baseAgent, promptMessages } = - await prepareForkedCommandContext(command, args, context) + const { skillContent, modifiedGetAppState, baseAgent, promptMessages } = await prepareForkedCommandContext( + command, + args, + context, + ); // Merge skill's effort into the agent definition so runAgent applies it - const agentDefinition = - command.effort !== undefined - ? { ...baseAgent, effort: command.effort } - : baseAgent + const agentDefinition = command.effort !== undefined ? { ...baseAgent, effort: command.effort } : baseAgent; - logForDebugging( - `Executing forked slash command /${command.name} with agent ${agentDefinition.agentType}`, - ) + logForDebugging(`Executing forked slash command /${command.name} with agent ${agentDefinition.agentType}`); // Assistant mode: fire-and-forget. 
Launch subagent in background, return // immediately, re-enqueue the result as an isMeta prompt when done. @@ -163,12 +154,25 @@ async function executeForkedSlashCommand( // isMeta prompts are hidden. Outside assistant mode, context:fork commands // are user-invoked skills (/commit etc.) that should run synchronously // with the progress UI. - if (feature('KAIROS') && (await context.getAppState()).kairosEnabled) { + const appState = await context.getAppState(); + const allowBackgroundForkedSlashCommands = context.options.allowBackgroundForkedSlashCommands === true; + if (allowBackgroundForkedSlashCommands) { + assertBackgroundForkedSlashCommandTestOverrideAllowed(); + } + let canRunBackgroundForkedSlashCommand = false; + if (appState.kairosEnabled) { + if (feature('KAIROS')) { + canRunBackgroundForkedSlashCommand = true; + } else if (allowBackgroundForkedSlashCommands) { + canRunBackgroundForkedSlashCommand = true; + } + } + if (canRunBackgroundForkedSlashCommand) { // Standalone abortController — background subagents survive main-thread // ESC (same policy as AgentTool's async path). They're cron-driven; if // killed mid-run they just re-fire on the next schedule. - const bgAbortController = createAbortController() - const commandName = getCommandName(command) + const bgAbortController = createAbortController(); + const commandName = getCommandName(command); // Workload: handlePromptSubmit wraps the entire turn in runWithWorkload // (AsyncLocalStorage). ALS context is captured when this `void` fires @@ -179,7 +183,7 @@ async function executeForkedSlashCommand( // handlePromptSubmit → fresh runWithWorkload boundary (which always // establishes a new context, even for `undefined`) → so it needs its // own QueuedCommand.workload tag to preserve attribution. - const spawnTimeWorkload = getWorkload() + const spawnTimeWorkload = getWorkload(); // Re-enter the queue as a hidden prompt. isMeta: hides from queue // preview + placeholder + transcript. 
skipSlashCommands: prevents @@ -195,7 +199,31 @@ async function executeForkedSlashCommand( isMeta: true, skipSlashCommands: true, workload: spawnTimeWorkload, - }) + }); + const finalizeDeferredAutonomyRunCompleted = async (): Promise => { + if (!autonomy?.runId) { + return; + } + const nextCommands = await finalizeAutonomyRunCompleted({ + runId: autonomy.runId, + rootDir: autonomy.rootDir, + priority: 'later', + workload: spawnTimeWorkload, + }); + for (const nextCommand of nextCommands) { + enqueue(nextCommand); + } + }; + const finalizeDeferredAutonomyRunFailed = async (error: unknown): Promise => { + if (!autonomy?.runId) { + return; + } + await finalizeAutonomyRunFailed({ + runId: autonomy.runId, + rootDir: autonomy.rootDir, + error: error instanceof Error ? error.message : String(error), + }); + }; void (async () => { // Wait for MCP servers to settle. Scheduled tasks fire at startup and @@ -204,16 +232,15 @@ async function executeForkedSlashCommand( // accidentally avoided this — tasks serialized, so task N's drain // happened after task N-1's 30s run, by which time MCP was up. // Poll until no 'pending' clients remain, then refresh. - const deadline = Date.now() + MCP_SETTLE_TIMEOUT_MS + const deadline = Date.now() + MCP_SETTLE_TIMEOUT_MS; while (Date.now() < deadline) { - const s = context.getAppState() - if (!s.mcp.clients.some(c => c.type === 'pending')) break - await sleep(MCP_SETTLE_POLL_MS) + const s = context.getAppState(); + if (!s.mcp.clients.some(c => c.type === 'pending')) break; + await sleep(MCP_SETTLE_POLL_MS); } - const freshTools = - context.options.refreshTools?.() ?? context.options.tools + const freshTools = context.options.refreshTools?.() ?? 
context.options.tools; - const agentMessages: Message[] = [] + const agentMessages: Message[] = []; for await (const message of runAgent({ agentDefinition, promptMessages, @@ -229,40 +256,53 @@ async function executeForkedSlashCommand( availableTools: freshTools, override: { agentId }, })) { - agentMessages.push(message) + agentMessages.push(message); } - const resultText = extractResultText(agentMessages, 'Command completed') - logForDebugging( - `Background forked command /${commandName} completed (agent ${agentId})`, - ) - enqueueResult( - `\n${resultText}\n`, - ) - })().catch(err => { - logError(err) + const resultText = extractResultText(agentMessages, 'Command completed'); + logForDebugging(`Background forked command /${commandName} completed (agent ${agentId})`); + // Enqueue the worker's result before finalizing the autonomy run so the + // notification is observed before any follow-up + // autonomy commands the finalizer enqueues at the same priority. Without + // this ordering, both land at `priority: 'later'` and the next autonomy + // step can run before the main thread sees this worker's output. + enqueueResult(`\n${resultText}\n`); + // The slash command itself succeeded; an error from the finalize call + // must not surface as a contradictory + // via the outer catch below. Log it locally and stop. + try { + await finalizeDeferredAutonomyRunCompleted(); + } catch (finalizeError) { + logError(finalizeError); + } + })().catch(async err => { + logError(err); enqueueResult( `\n${err instanceof Error ? err.message : String(err)}\n`, - ) - }) + ); + await finalizeDeferredAutonomyRunFailed(err); + }); // Nothing to render, nothing to query — the background runner re-enters // the queue on its own schedule. 
- return { messages: [], shouldQuery: false, command } + return { + messages: [], + shouldQuery: false, + command, + deferAutonomyCompletion: Boolean(autonomy?.runId), + }; } // Collect messages from the forked agent - const agentMessages: Message[] = [] + const agentMessages: Message[] = []; // Build progress messages for the agent progress UI - const progressMessages: ProgressMessage[] = [] - const parentToolUseID = `forked-command-${command.name}` - let toolUseCounter = 0 + const progressMessages: ProgressMessage[] = []; + const parentToolUseID = `forked-command-${command.name}`; + let toolUseCounter = 0; // Helper to create a progress message from an agent message - const createProgressMessage = ( - message: AssistantMessage | NormalizedUserMessage, - ): ProgressMessage => { - toolUseCounter++ + const createProgressMessage = (message: AssistantMessage | NormalizedUserMessage): ProgressMessage => { + toolUseCounter++; return { type: 'progress', data: { @@ -275,8 +315,8 @@ async function executeForkedSlashCommand( toolUseID: `${parentToolUseID}-${toolUseCounter}`, timestamp: new Date().toISOString(), uuid: randomUUID(), - } - } + }; + }; // Helper to update progress display using agent progress UI const updateProgress = (): void => { @@ -288,11 +328,11 @@ async function executeForkedSlashCommand( shouldHidePromptInput: false, shouldContinueAnimation: true, showSpinner: true, - }) - } + }); + }; // Show initial "Initializing…" state - updateProgress() + updateProgress(); // Run the sub-agent try { @@ -309,47 +349,45 @@ async function executeForkedSlashCommand( model: command.model as ModelAlias | undefined, availableTools: context.options.tools, })) { - agentMessages.push(message) - const normalizedNew = normalizeMessages([message]) + agentMessages.push(message); + const normalizedNew = normalizeMessages([message]); // Add progress message for assistant messages (which contain tool uses) if (message.type === 'assistant') { // Increment token count in spinner for 
assistant messages - const contentLength = getAssistantMessageContentLength(message as AssistantMessage) + const contentLength = getAssistantMessageContentLength(message as AssistantMessage); if (contentLength > 0) { - context.setResponseLength(len => len + contentLength) + context.setResponseLength(len => len + contentLength); } - const normalizedMsg = normalizedNew[0] + const normalizedMsg = normalizedNew[0]; if (normalizedMsg && normalizedMsg.type === 'assistant') { - progressMessages.push(createProgressMessage(message as AssistantMessage)) - updateProgress() + progressMessages.push(createProgressMessage(message as AssistantMessage)); + updateProgress(); } } // Add progress message for user messages (which contain tool results) if (message.type === 'user') { - const normalizedMsg = normalizedNew[0] + const normalizedMsg = normalizedNew[0]; if (normalizedMsg && normalizedMsg.type === 'user') { - progressMessages.push(createProgressMessage(normalizedMsg as AssistantMessage)) - updateProgress() + progressMessages.push(createProgressMessage(normalizedMsg as AssistantMessage)); + updateProgress(); } } } } finally { // Clear the progress display - setToolJSX(null) + setToolJSX(null); } - let resultText = extractResultText(agentMessages, 'Command completed') + let resultText = extractResultText(agentMessages, 'Command completed'); - logForDebugging( - `Forked slash command /${command.name} completed with agent ${agentId}`, - ) + logForDebugging(`Forked slash command /${command.name} completed with agent ${agentId}`); // Prepend debug log for ant users so it appears inside the command output if (process.env.USER_TYPE === 'ant') { - resultText = `[ANT-ONLY] API calls: ${getDisplayPath(getDumpPromptsPath(agentId))}\n${resultText}` + resultText = `[ANT-ONLY] API calls: ${getDisplayPath(getDumpPromptsPath(agentId))}\n${resultText}`; } // Return the result as a user message (simulates the agent's output) @@ -363,14 +401,14 @@ async function executeForkedSlashCommand( 
createUserMessage({ content: `\n${resultText}\n`, }), - ] + ]; return { messages, shouldQuery: false, command, resultText, - } + }; } /** @@ -383,7 +421,7 @@ async function executeForkedSlashCommand( export function looksLikeCommand(commandName: string): boolean { // Command names should only contain [a-zA-Z0-9:_-] // If it contains other characters, it's probably a file path or other input - return !/[^a-zA-Z0-9:\-_]/.test(commandName) + return !/[^a-zA-Z0-9:\-_]/.test(commandName); } export async function processSlashCommand( @@ -396,11 +434,12 @@ export async function processSlashCommand( uuid?: string, isAlreadyProcessing?: boolean, canUseTool?: CanUseToolFn, + autonomy?: QueuedCommand['autonomy'], ): Promise { - const parsed = parseSlashCommand(inputString) + const parsed = parseSlashCommand(inputString); if (!parsed) { - logEvent('tengu_input_slash_missing', {}) - const errorMessage = 'Commands are in the form `/command [args]`' + logEvent('tengu_input_slash_missing', {}); + const errorMessage = 'Commands are in the form `/command [args]`'; return { messages: [ createSyntheticUserCaveatMessage(), @@ -414,35 +453,30 @@ export async function processSlashCommand( ], shouldQuery: false, resultText: errorMessage, - } + }; } - const { commandName, args: parsedArgs, isMcp } = parsed + const { commandName, args: parsedArgs, isMcp } = parsed; - const sanitizedCommandName = isMcp - ? 'mcp' - : !builtInCommandNames().has(commandName) - ? 'custom' - : commandName + const sanitizedCommandName = isMcp ? 'mcp' : !builtInCommandNames().has(commandName) ? 
'custom' : commandName; // Check if it's a real command before processing if (!hasCommand(commandName, context.options.commands)) { // Check if this looks like a command name vs a file path or other input // Also check if it's an actual file path that exists - let isFilePath = false + let isFilePath = false; try { - await getFsImplementation().stat(`/${commandName}`) - isFilePath = true + await getFsImplementation().stat(`/${commandName}`); + isFilePath = true; } catch { // Not a file path — treat as command name } if (looksLikeCommand(commandName) && !isFilePath) { logEvent('tengu_input_slash_invalid', { - input: - commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, - }) + input: commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + }); - const unknownMessage = `Unknown skill: ${commandName}` + const unknownMessage = `Unknown skill: ${commandName}`; return { messages: [ createSyntheticUserCaveatMessage(), @@ -455,29 +489,22 @@ export async function processSlashCommand( }), // gh-32591: preserve args so the user can copy/resubmit without // retyping. System warning is UI-only (filtered before API). - ...(parsedArgs - ? [ - createSystemMessage( - `Args from unknown skill: ${parsedArgs}`, - 'warning', - ), - ] - : []), + ...(parsedArgs ? 
[createSystemMessage(`Args from unknown skill: ${parsedArgs}`, 'warning')] : []), ], shouldQuery: false, resultText: unknownMessage, - } + }; } - const promptId = randomUUID() - setPromptId(promptId) - logEvent('tengu_input_prompt', {}) + const promptId = randomUUID(); + setPromptId(promptId); + logEvent('tengu_input_prompt', {}); // Log user prompt event for OTLP void logOTelEvent('user_prompt', { prompt_length: String(inputString.length), prompt: redactIfDisabled(inputString), 'prompt.id': promptId, - }) + }); return { messages: [ createUserMessage({ @@ -487,7 +514,7 @@ export async function processSlashCommand( ...attachmentMessages, ], shouldQuery: true, - } + }; } // Track slash command usage for feature discovery @@ -502,6 +529,7 @@ export async function processSlashCommand( resultText, nextInput, submitNextInput, + deferAutonomyCompletion, } = await getMessagesForSlashCommand( commandName, parsedArgs, @@ -512,66 +540,55 @@ export async function processSlashCommand( isAlreadyProcessing, canUseTool, uuid, - ) + autonomy, + ); // Local slash commands that skip messages if (newMessages.length === 0) { const eventData: Record = { - input: - sanitizedCommandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, - } + input: sanitizedCommandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + }; // Add plugin metadata if this is a plugin command if (returnedCommand.type === 'prompt' && returnedCommand.pluginInfo) { - const { pluginManifest, repository } = returnedCommand.pluginInfo - const { marketplace } = parsePluginIdentifier(repository) - const isOfficial = isOfficialMarketplaceName(marketplace) + const { pluginManifest, repository } = returnedCommand.pluginInfo; + const { marketplace } = parsePluginIdentifier(repository); + const isOfficial = isOfficialMarketplaceName(marketplace); // _PROTO_* routes to PII-tagged plugin_name/marketplace_name BQ columns // (unredacted, all users); plugin_name/plugin_repository stay in // 
additional_metadata as redacted variants for general-access dashboards. - eventData._PROTO_plugin_name = - pluginManifest.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED + eventData._PROTO_plugin_name = pluginManifest.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED; if (marketplace) { - eventData._PROTO_marketplace_name = - marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED + eventData._PROTO_marketplace_name = marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED; } eventData.plugin_repository = ( isOfficial ? repository : 'third-party' - ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS + ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS; eventData.plugin_name = ( isOfficial ? pluginManifest.name : 'third-party' - ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS + ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS; if (isOfficial && pluginManifest.version) { - eventData.plugin_version = - pluginManifest.version as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS + eventData.plugin_version = pluginManifest.version as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS; } - Object.assign( - eventData, - buildPluginCommandTelemetryFields(returnedCommand.pluginInfo), - ) + Object.assign(eventData, buildPluginCommandTelemetryFields(returnedCommand.pluginInfo)); } logEvent('tengu_input_command', { ...eventData, - invocation_trigger: - 'user-slash' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + invocation_trigger: 'user-slash' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, ...(process.env.USER_TYPE === 'ant' && { - skill_name: - commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_name: commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, ...(returnedCommand.type === 'prompt' && { - skill_source: - returnedCommand.source as 
AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_source: returnedCommand.source as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }), ...(returnedCommand.loadedFrom && { - skill_loaded_from: - returnedCommand.loadedFrom as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_loaded_from: returnedCommand.loadedFrom as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }), ...(returnedCommand.kind && { - skill_kind: - returnedCommand.kind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_kind: returnedCommand.kind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }), }), - }) + }); return { messages: [], shouldQuery: false, @@ -579,7 +596,8 @@ export async function processSlashCommand( model, nextInput, submitNextInput, - } + deferAutonomyCompletion, + }; } // For invalid commands, preserve both the user message and error @@ -591,15 +609,12 @@ export async function processSlashCommand( ) { // Don't log as invalid if it looks like a common file path const looksLikeFilePath = - inputString.startsWith('/var') || - inputString.startsWith('/tmp') || - inputString.startsWith('/private') + inputString.startsWith('/var') || inputString.startsWith('/tmp') || inputString.startsWith('/private'); if (!looksLikeFilePath) { logEvent('tengu_input_slash_invalid', { - input: - commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, - }) + input: commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + }); } return { @@ -608,75 +623,58 @@ export async function processSlashCommand( allowedTools, model, - } + }; } // A valid command const eventData: Record = { - input: - sanitizedCommandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, - } + input: sanitizedCommandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + }; // Add plugin metadata if this is a plugin command if (returnedCommand.type === 'prompt' && 
returnedCommand.pluginInfo) { - const { pluginManifest, repository } = returnedCommand.pluginInfo - const { marketplace } = parsePluginIdentifier(repository) - const isOfficial = isOfficialMarketplaceName(marketplace) - eventData._PROTO_plugin_name = - pluginManifest.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED + const { pluginManifest, repository } = returnedCommand.pluginInfo; + const { marketplace } = parsePluginIdentifier(repository); + const isOfficial = isOfficialMarketplaceName(marketplace); + eventData._PROTO_plugin_name = pluginManifest.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED; if (marketplace) { - eventData._PROTO_marketplace_name = - marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED + eventData._PROTO_marketplace_name = marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED; } eventData.plugin_repository = ( isOfficial ? repository : 'third-party' - ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS + ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS; eventData.plugin_name = ( isOfficial ? 
pluginManifest.name : 'third-party' - ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS + ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS; if (isOfficial && pluginManifest.version) { - eventData.plugin_version = - pluginManifest.version as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS + eventData.plugin_version = pluginManifest.version as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS; } - Object.assign( - eventData, - buildPluginCommandTelemetryFields(returnedCommand.pluginInfo), - ) + Object.assign(eventData, buildPluginCommandTelemetryFields(returnedCommand.pluginInfo)); } logEvent('tengu_input_command', { ...eventData, - invocation_trigger: - 'user-slash' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + invocation_trigger: 'user-slash' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, ...(process.env.USER_TYPE === 'ant' && { - skill_name: - commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_name: commandName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, ...(returnedCommand.type === 'prompt' && { - skill_source: - returnedCommand.source as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_source: returnedCommand.source as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }), ...(returnedCommand.loadedFrom && { - skill_loaded_from: - returnedCommand.loadedFrom as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_loaded_from: returnedCommand.loadedFrom as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }), ...(returnedCommand.kind && { - skill_kind: - returnedCommand.kind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, + skill_kind: returnedCommand.kind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }), }), - }) + }); // Check if this is a compact result which handle their own synthetic caveat message ordering - const isCompactResult = - 
newMessages.length > 0 && - newMessages[0] && - isCompactBoundaryMessage(newMessages[0]) + const isCompactResult = newMessages.length > 0 && newMessages[0] && isCompactBoundaryMessage(newMessages[0]); return { messages: - messageShouldQuery || - newMessages.every(isSystemLocalCommandMessage) || - isCompactResult + messageShouldQuery || newMessages.every(isSystemLocalCommandMessage) || isCompactResult ? newMessages : [createSyntheticUserCaveatMessage(), ...newMessages], shouldQuery: messageShouldQuery, @@ -686,7 +684,8 @@ export async function processSlashCommand( resultText, nextInput, submitNextInput, - } + deferAutonomyCompletion, + }; } async function getMessagesForSlashCommand( @@ -699,12 +698,13 @@ async function getMessagesForSlashCommand( _isAlreadyProcessing?: boolean, canUseTool?: CanUseToolFn, uuid?: string, + autonomy?: QueuedCommand['autonomy'], ): Promise { - const command = getCommand(commandName, context.options.commands) + const command = getCommand(commandName, context.options.commands); // Track skill usage for ranking (only for prompt commands that are user-invocable) if (command.type === 'prompt' && command.userInvocable !== false) { - recordSkillUsage(commandName) + recordSkillUsage(commandName); } // Check if the command is user-invocable @@ -724,25 +724,25 @@ async function getMessagesForSlashCommand( ], shouldQuery: false, command, - } + }; } try { switch (command.type) { case 'local-jsx': { return new Promise(resolve => { - let doneWasCalled = false + let doneWasCalled = false; const onDone = ( result?: string, options?: { - display?: CommandResultDisplay - shouldQuery?: boolean - metaMessages?: string[] - nextInput?: string - submitNextInput?: boolean + display?: CommandResultDisplay; + shouldQuery?: boolean; + metaMessages?: string[]; + nextInput?: string; + submitNextInput?: boolean; }, ) => { - doneWasCalled = true + doneWasCalled = true; // If display is 'skip', don't add any messages to the conversation if (options?.display === 
'skip') { void resolve({ @@ -751,14 +751,14 @@ async function getMessagesForSlashCommand( command, nextInput: options?.nextInput, submitNextInput: options?.submitNextInput, - }) - return + }); + return; } // Meta messages are model-visible but hidden from the user - const metaMessages = (options?.metaMessages ?? []).map( - (content: string) => createUserMessage({ content, isMeta: true }), - ) + const metaMessages = (options?.metaMessages ?? []).map((content: string) => + createUserMessage({ content, isMeta: true }), + ); // In fullscreen the command just showed as a centered modal // pane — the transient notification is enough feedback. The @@ -771,9 +771,7 @@ async function getMessagesForSlashCommand( // usage, /rename, /proactive) use display:system for actual // output that must reach the transcript. const skipTranscript = - isFullscreenEnvEnabled() && - typeof result === 'string' && - result.endsWith(' dismissed') + isFullscreenEnvEnabled() && typeof result === 'string' && result.endsWith(' dismissed'); void resolve({ messages: @@ -781,12 +779,8 @@ async function getMessagesForSlashCommand( ? skipTranscript ? metaMessages : [ - createCommandInputMessage( - formatCommandInput(command, args), - ), - createCommandInputMessage( - `${result}`, - ), + createCommandInputMessage(formatCommandInput(command, args)), + createCommandInputMessage(`${result}`), ...metaMessages, ] : [ @@ -809,21 +803,21 @@ async function getMessagesForSlashCommand( command, nextInput: options?.nextInput, submitNextInput: options?.submitNextInput, - }) - } + }); + }; void command .load() .then(mod => mod.call(onDone, { ...context, canUseTool }, args)) .then(jsx => { - if (jsx == null) return + if (jsx == null) return; if (context.options.isNonInteractiveSession) { void resolve({ messages: [], shouldQuery: false, command, - }) - return + }); + return; } // Guard: if onDone fired during mod.call() (early-exit path // that calls onDone then returns JSX), skip setToolJSX. 
This @@ -832,51 +826,51 @@ async function getMessagesForSlashCommand( // its setToolJSX({clearLocalJSX: true}) before we get here. // Setting isLocalJSXCommand after clear leaves it stuck true, // blocking useQueueProcessor and TextInput focus. - if (doneWasCalled) return + if (doneWasCalled) return; setToolJSX({ jsx, shouldHidePromptInput: true, showSpinner: false, isLocalJSXCommand: true, isImmediate: command.immediate === true, - }) + }); }) .catch(e => { // If load()/call() throws and onDone never fired, the outer // Promise hangs forever, leaving queryGuard stuck in // 'dispatching' and deadlocking the queue processor. - logError(e) - if (doneWasCalled) return - doneWasCalled = true + logError(e); + if (doneWasCalled) return; + doneWasCalled = true; setToolJSX({ jsx: null, shouldHidePromptInput: false, clearLocalJSX: true, - }) - void resolve({ messages: [], shouldQuery: false, command }) - }) - }) + }); + void resolve({ messages: [], shouldQuery: false, command }); + }); + }); } case 'local': { - const displayArgs = command.isSensitive && args.trim() ? '***' : args + const displayArgs = command.isSensitive && args.trim() ? 
'***' : args; const userMessage = createUserMessage({ content: prepareUserContent({ inputString: formatCommandInput(command, displayArgs), precedingInputBlocks, }), - }) + }); try { - const syntheticCaveatMessage = createSyntheticUserCaveatMessage() - const mod = await command.load() - const result = await mod.call(args, context) + const syntheticCaveatMessage = createSyntheticUserCaveatMessage(); + const mod = await command.load(); + const result = await mod.call(args, context); if (result.type === 'skip') { return { messages: [], shouldQuery: false, command, - } + }; } // Use discriminated union to handle different result types @@ -899,52 +893,43 @@ async function getMessagesForSlashCommand( }), ] : []), - ] + ]; const compactionResultWithSlashMessages = { ...result.compactionResult, - messagesToKeep: [ - ...(result.compactionResult.messagesToKeep ?? []), - ...slashCommandMessages, - ], - } + messagesToKeep: [...(result.compactionResult.messagesToKeep ?? []), ...slashCommandMessages], + }; // Reset microcompact state since full compact replaces all // messages — old tool IDs are no longer relevant. Budget state // (on toolUseContext) needs no reset: stale entries are inert // (UUIDs never repeat, so they're never looked up). 
- resetMicrocompactState() + resetMicrocompactState(); return { - messages: buildPostCompactMessages( - compactionResultWithSlashMessages, - ) as AssistantMessage[], + messages: buildPostCompactMessages(compactionResultWithSlashMessages) as AssistantMessage[], shouldQuery: false, command, - } + }; } // Text result — use system message so it doesn't render as a user bubble return { messages: [ userMessage, - createCommandInputMessage( - `${result.value}`, - ), + createCommandInputMessage(`${result.value}`), ], shouldQuery: false, command, resultText: result.value, - } + }; } catch (e) { - logError(e) + logError(e); return { messages: [ userMessage, - createCommandInputMessage( - `${String(e)}`, - ), + createCommandInputMessage(`${String(e)}`), ], shouldQuery: false, command, - } + }; } } case 'prompt': { @@ -958,7 +943,8 @@ async function getMessagesForSlashCommand( precedingInputBlocks, setToolJSX, canUseTool ?? hasPermissionsToUseTool, - ) + autonomy, + ); } return await getMessagesForPromptSlashCommand( @@ -968,7 +954,7 @@ async function getMessagesForSlashCommand( precedingInputBlocks, imageContentBlocks, uuid, - ) + ); } catch (e) { // Handle abort errors specially to show proper "Interrupted" message if (e instanceof AbortError) { @@ -984,7 +970,7 @@ async function getMessagesForSlashCommand( ], shouldQuery: false, command, - } + }; } return { messages: [ @@ -1000,7 +986,7 @@ async function getMessagesForSlashCommand( ], shouldQuery: false, command, - } + }; } } } @@ -1017,46 +1003,40 @@ async function getMessagesForSlashCommand( ], shouldQuery: false, command, - } + }; } - throw e + throw e; } } function formatCommandInput(command: CommandBase, args: string): string { - return formatCommandInputTags(getCommandName(command), args) + return formatCommandInputTags(getCommandName(command), args); } /** * Formats the metadata for a skill loading message. * Used by the Skill tool and for subagent skill preloading. 
*/ -export function formatSkillLoadingMetadata( - skillName: string, - _progressMessage: string = 'loading', -): string { +export function formatSkillLoadingMetadata(skillName: string, _progressMessage: string = 'loading'): string { // Use skill name only - UserCommandMessage renders as "Skill(name)" return [ `<${COMMAND_MESSAGE_TAG}>${skillName}`, `<${COMMAND_NAME_TAG}>${skillName}`, `true`, - ].join('\n') + ].join('\n'); } /** * Formats the metadata for a slash command loading message. */ -function formatSlashCommandLoadingMetadata( - commandName: string, - args?: string, -): string { +function formatSlashCommandLoadingMetadata(commandName: string, args?: string): string { return [ `<${COMMAND_MESSAGE_TAG}>${commandName}`, `<${COMMAND_NAME_TAG}>/${commandName}`, args ? `${args}` : null, ] .filter(Boolean) - .join('\n') + .join('\n'); } /** @@ -1064,26 +1044,19 @@ function formatSlashCommandLoadingMetadata( * User-invocable skills use slash command format (/name), while model-only * skills use the skill format ("The X skill is running"). */ -function formatCommandLoadingMetadata( - command: CommandBase & PromptCommand, - args?: string, -): string { +function formatCommandLoadingMetadata(command: CommandBase & PromptCommand, args?: string): string { // Use command.name (the qualified name including plugin prefix, e.g. // "product-management:feature-spec") instead of userFacingName() which may // strip the plugin prefix via displayName fallback. 
// User-invocable skills should show as /command-name like regular slash commands if (command.userInvocable !== false) { - return formatSlashCommandLoadingMetadata(command.name, args) + return formatSlashCommandLoadingMetadata(command.name, args); } // Model-only skills (userInvocable: false) show as "The X skill is running" - if ( - command.loadedFrom === 'skills' || - command.loadedFrom === 'plugin' || - command.loadedFrom === 'mcp' - ) { - return formatSkillLoadingMetadata(command.name, command.progressMessage) + if (command.loadedFrom === 'skills' || command.loadedFrom === 'plugin' || command.loadedFrom === 'mcp') { + return formatSkillLoadingMetadata(command.name, command.progressMessage); } - return formatSlashCommandLoadingMetadata(command.name, args) + return formatSlashCommandLoadingMetadata(command.name, args); } export async function processPromptSlashCommand( @@ -1093,22 +1066,16 @@ export async function processPromptSlashCommand( context: ToolUseContext, imageContentBlocks: ContentBlockParam[] = [], ): Promise { - const command = findCommand(commandName, commands) + const command = findCommand(commandName, commands); if (!command) { - throw new MalformedCommandError(`Unknown command: ${commandName}`) + throw new MalformedCommandError(`Unknown command: ${commandName}`); } if (command.type !== 'prompt') { throw new Error( `Unexpected ${command.type} command. Expected 'prompt' command. 
Use /${commandName} directly in the main conversation.`, - ) + ); } - return getMessagesForPromptSlashCommand( - command, - args, - context, - [], - imageContentBlocks, - ) + return getMessagesForPromptSlashCommand(command, args, context, [], imageContentBlocks); } async function getMessagesForPromptSlashCommand( @@ -1128,33 +1095,23 @@ async function getMessagesForPromptSlashCommand( // parent env, so we also check !context.agentId: agentId is only set for // subagents, letting workers fall through to getPromptForCommand and receive // the real skill content when they invoke the Skill tool. - if ( - feature('COORDINATOR_MODE') && - isEnvTruthy(process.env.CLAUDE_CODE_COORDINATOR_MODE) && - !context.agentId - ) { - const metadata = formatCommandLoadingMetadata(command, args) - const parts: string[] = [ - `Skill "/${command.name}" is available for workers.`, - ] + if (feature('COORDINATOR_MODE') && isEnvTruthy(process.env.CLAUDE_CODE_COORDINATOR_MODE) && !context.agentId) { + const metadata = formatCommandLoadingMetadata(command, args); + const parts: string[] = [`Skill "/${command.name}" is available for workers.`]; if (command.description) { - parts.push(`Description: ${command.description}`) + parts.push(`Description: ${command.description}`); } if (command.whenToUse) { - parts.push(`When to use: ${command.whenToUse}`) + parts.push(`When to use: ${command.whenToUse}`); } - const skillAllowedTools = command.allowedTools ?? [] + const skillAllowedTools = command.allowedTools ?? []; if (skillAllowedTools.length > 0) { - parts.push( - `This skill grants workers additional tool permissions: ${skillAllowedTools.join(', ')}`, - ) + parts.push(`This skill grants workers additional tool permissions: ${skillAllowedTools.join(', ')}`); } parts.push( `\nInstruct a worker to use this skill by including "Use the /${command.name} skill" in your Agent prompt. 
The worker has access to the Skill tool and will receive the skill's content and permissions when it invokes it.`, - ) - const summaryContent: ContentBlockParam[] = [ - { type: 'text', text: parts.join('\n') }, - ] + ); + const summaryContent: ContentBlockParam[] = [{ type: 'text', text: parts.join('\n') }]; return { messages: [ createUserMessage({ content: metadata, uuid }), @@ -1164,55 +1121,45 @@ async function getMessagesForPromptSlashCommand( model: command.model, effort: command.effort, command, - } + }; } - const result = await command.getPromptForCommand(args, context) + const result = await command.getPromptForCommand(args, context); // Register skill hooks if defined. Under ["hooks"]-only (skills not locked), // user skills still load and reach this point — block hook REGISTRATION here // where source is known. Mirrors the agent frontmatter gate in runAgent.ts. - const hooksAllowedForThisSkill = - !isRestrictedToPluginOnly('hooks') || isSourceAdminTrusted(command.source) + const hooksAllowedForThisSkill = !isRestrictedToPluginOnly('hooks') || isSourceAdminTrusted(command.source); if (command.hooks && hooksAllowedForThisSkill) { - const sessionId = getSessionId() + const sessionId = getSessionId(); registerSkillHooks( context.setAppState, sessionId, command.hooks, command.name, command.type === 'prompt' ? command.skillRoot : undefined, - ) + ); } // Record skill invocation for compaction preservation, scoped by agent context. // Skills are tagged with their agentId so only skills belonging to the current // agent are restored during compaction (preventing cross-agent leaks). - const skillPath = command.source - ? `${command.source}:${command.name}` - : command.name + const skillPath = command.source ? 
`${command.source}:${command.name}` : command.name; const skillContent = result .filter((b): b is TextBlockParam => b.type === 'text') .map(b => b.text) - .join('\n\n') - addInvokedSkill( - command.name, - skillPath, - skillContent, - getAgentContext()?.agentId ?? null, - ) + .join('\n\n'); + addInvokedSkill(command.name, skillPath, skillContent, getAgentContext()?.agentId ?? null); - const metadata = formatCommandLoadingMetadata(command, args) + const metadata = formatCommandLoadingMetadata(command, args); - const additionalAllowedTools = parseToolListFromCLI( - command.allowedTools ?? [], - ) + const additionalAllowedTools = parseToolListFromCLI(command.allowedTools ?? []); // Create content for the main message, including any pasted images const mainMessageContent: ContentBlockParam[] = imageContentBlocks.length > 0 || precedingInputBlocks.length > 0 ? [...imageContentBlocks, ...precedingInputBlocks, ...result] - : result + : result; // Extract attachments from command arguments (@-mentions, MCP resources, // agent mentions in SKILL.md). 
skipSkillDiscovery prevents the SKILL.md @@ -1232,7 +1179,7 @@ async function getMessagesForPromptSlashCommand( 'repl_main_thread', { skipSkillDiscovery: true }, ), - ) + ); const messages = [ createUserMessage({ @@ -1249,7 +1196,7 @@ async function getMessagesForPromptSlashCommand( allowedTools: additionalAllowedTools, model: command.model, }), - ] + ]; return { messages, @@ -1258,5 +1205,5 @@ async function getMessagesForPromptSlashCommand( model: command.model, effort: command.effort, command, - } + }; } diff --git a/src/utils/processUserInput/processUserInput.ts b/src/utils/processUserInput/processUserInput.ts index 94682aebfb..e625eeea4c 100644 --- a/src/utils/processUserInput/processUserInput.ts +++ b/src/utils/processUserInput/processUserInput.ts @@ -28,6 +28,7 @@ import type { import type { PermissionMode } from '../../types/permissions.js' import { isValidImagePaste, + type QueuedCommand, type PromptInputMode, } from '../../types/textInputTypes.js' import { @@ -80,6 +81,9 @@ export type ProcessUserInputBaseResult = { // Used by /discover to chain into the selected feature's command nextInput?: string submitNextInput?: boolean + // When true, the command started detached work that will finalize its + // autonomy run after the background work completes. + deferAutonomyCompletion?: boolean } export async function processUserInput({ @@ -100,6 +104,7 @@ export async function processUserInput({ bridgeOrigin, isMeta, skipAttachments, + autonomy, }: { input: string | Array /** @@ -137,6 +142,7 @@ export async function processUserInput({ */ isMeta?: boolean skipAttachments?: boolean + autonomy?: QueuedCommand['autonomy'] }): Promise { const inputString = typeof input === 'string' ? input : null // Immediately show the user input prompt while we are still processing the input. 
@@ -168,6 +174,7 @@ export async function processUserInput({ isMeta, skipAttachments, preExpansionInput, + autonomy, ) queryCheckpoint('query_process_user_input_base_end') @@ -296,6 +303,7 @@ async function processUserInputBase( isMeta?: boolean, skipAttachments?: boolean, preExpansionInput?: string, + autonomy?: QueuedCommand['autonomy'], ): Promise { let inputString: string | null = null let precedingInputBlocks: ContentBlockParam[] = [] @@ -491,6 +499,7 @@ async function processUserInputBase( uuid, isAlreadyProcessing, canUseTool, + autonomy, ) return addImageMetadataMessage(slashResult, imageMetadataTexts) } @@ -549,6 +558,7 @@ async function processUserInputBase( uuid, isAlreadyProcessing, canUseTool, + autonomy, ) return addImageMetadataMessage(slashResult, imageMetadataTexts) } diff --git a/src/utils/swarm/inProcessRunner.ts b/src/utils/swarm/inProcessRunner.ts index 5320fd2940..f01582ea16 100644 --- a/src/utils/swarm/inProcessRunner.ts +++ b/src/utils/swarm/inProcessRunner.ts @@ -424,8 +424,7 @@ function createInProcessCanUseTool( feedback: parsed.error, }) } - cleanup() - return + return // Callback already resolves the promise } } } @@ -675,6 +674,7 @@ type WaitResult = type: 'new_message' message: string autonomyRunId?: string + autonomyRootDir?: string from: string color?: string summary?: string @@ -739,12 +739,16 @@ async function waitForNextPromptOrShutdown( `[inProcessRunner] ${identity.agentName} found pending user message (poll #${pollCount})`, ) if (pending.autonomyRunId) { - await markAutonomyRunRunning(pending.autonomyRunId) + await markAutonomyRunRunning( + pending.autonomyRunId, + pending.autonomyRootDir, + ) } return { type: 'new_message', message: pending.message, autonomyRunId: pending.autonomyRunId, + autonomyRootDir: pending.autonomyRootDir, from: 'user', } } @@ -1022,6 +1026,7 @@ export async function runInProcessTeammate( ) let currentPrompt = wrappedInitialPrompt let currentAutonomyRunId: string | undefined + let 
currentAutonomyRootDir: string | undefined let shouldExit = false // Try to claim an available task immediately so the UI can show activity @@ -1319,12 +1324,21 @@ export async function runInProcessTeammate( setAppState, ) if (currentAutonomyRunId) { - await markAutonomyRunFailed(currentAutonomyRunId, ERROR_MESSAGE_USER_ABORT) + await markAutonomyRunFailed( + currentAutonomyRunId, + ERROR_MESSAGE_USER_ABORT, + currentAutonomyRootDir, + ) currentAutonomyRunId = undefined + currentAutonomyRootDir = undefined } } else if (currentAutonomyRunId) { - await markAutonomyRunCompleted(currentAutonomyRunId) + await markAutonomyRunCompleted( + currentAutonomyRunId, + currentAutonomyRootDir, + ) currentAutonomyRunId = undefined + currentAutonomyRootDir = undefined } // Check if already idle before updating (to skip duplicate notification) @@ -1398,6 +1412,7 @@ export async function runInProcessTeammate( setAppState, ) currentAutonomyRunId = undefined + currentAutonomyRootDir = undefined break case 'new_message': @@ -1410,6 +1425,7 @@ export async function runInProcessTeammate( if (waitResult.from === 'user') { currentPrompt = waitResult.message currentAutonomyRunId = waitResult.autonomyRunId + currentAutonomyRootDir = waitResult.autonomyRootDir } else { currentPrompt = formatAsTeammateMessage( waitResult.from, @@ -1426,6 +1442,7 @@ export async function runInProcessTeammate( setAppState, ) currentAutonomyRunId = undefined + currentAutonomyRootDir = undefined } break @@ -1533,7 +1550,11 @@ export async function runInProcessTeammate( }) } if (currentAutonomyRunId) { - await markAutonomyRunFailed(currentAutonomyRunId, errorMessage) + await markAutonomyRunFailed( + currentAutonomyRunId, + errorMessage, + currentAutonomyRootDir, + ) } // Send idle notification with failure via file-based mailbox diff --git a/src/utils/swarm/spawnInProcess.ts b/src/utils/swarm/spawnInProcess.ts index 5cfa0ab5aa..77768b67ca 100644 --- a/src/utils/swarm/spawnInProcess.ts +++ 
b/src/utils/swarm/spawnInProcess.ts @@ -234,7 +234,7 @@ export function killInProcessTeammate( let agentId: string | null = null let toolUseId: string | undefined let description: string | undefined - let pendingAutonomyRunIds: string[] = [] + let pendingAutonomyRuns: Array<{ runId: string; rootDir?: string }> = [] setAppState((prev: AppState) => { const task = prev.tasks[taskId] @@ -255,9 +255,18 @@ export function killInProcessTeammate( description = teammateTask.description // Capture pending autonomy run IDs before clearing them - pendingAutonomyRunIds = teammateTask.pendingUserMessages - .map(message => message.autonomyRunId) - .filter((runId): runId is string => runId !== undefined) + pendingAutonomyRuns = teammateTask.pendingUserMessages.flatMap(message => + message.autonomyRunId + ? [ + { + runId: message.autonomyRunId, + ...(message.autonomyRootDir + ? { rootDir: message.autonomyRootDir } + : {}), + }, + ] + : [], + ) // Abort the controller to stop execution teammateTask.abortController?.abort() @@ -311,10 +320,11 @@ export function killInProcessTeammate( } if (killed) { - for (const runId of pendingAutonomyRunIds) { + for (const run of pendingAutonomyRuns) { void markAutonomyRunFailed( - runId, + run.runId, `Teammate ${agentId ?? 
taskId} was stopped before it could consume the queued autonomy prompt.`, + run.rootDir, ) } void evictTaskOutput(taskId) diff --git a/tests/integration/autonomy-lifecycle-user-flow.test.ts b/tests/integration/autonomy-lifecycle-user-flow.test.ts new file mode 100644 index 0000000000..b9e7bd172e --- /dev/null +++ b/tests/integration/autonomy-lifecycle-user-flow.test.ts @@ -0,0 +1,148 @@ +import { afterEach, beforeEach, describe, expect, test } from 'bun:test' +import { existsSync, mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join, resolve } from 'node:path' +import { + resetStateForTests, + setOriginalCwd, + setProjectRoot, +} from '../../src/bootstrap/state' +import { + listAutonomyRuns, + startManagedAutonomyFlowFromHeartbeatTask, +} from '../../src/utils/autonomyRuns' +import { listAutonomyFlows } from '../../src/utils/autonomyFlows' + +const CLI_ENTRYPOINT = resolve(import.meta.dir, '../../src/entrypoints/cli.tsx') + +let tempDir = '' +let configDir = '' +let previousConfigDir: string | undefined + +async function runAutonomyCli(args: string[]): Promise { + const proc = Bun.spawn({ + cmd: [process.execPath, CLI_ENTRYPOINT, 'autonomy', ...args], + cwd: tempDir, + env: { + ...process.env, + CLAUDE_CONFIG_DIR: configDir, + CI: 'true', + GITHUB_ACTIONS: 'true', + NODE_ENV: 'development', + NO_COLOR: '1', + }, + stdin: 'ignore', + stdout: 'pipe', + stderr: 'pipe', + }) + + const [stdout, stderr, exitCode] = await Promise.all([ + new Response(proc.stdout).text(), + new Response(proc.stderr).text(), + proc.exited, + ]) + + expect(stderr, `unexpected stderr output:\n${stderr}`).toBe('') + expect(exitCode, `non-zero exit ${exitCode}; stderr:\n${stderr}`).toBe(0) + return stdout +} + +beforeEach(() => { + tempDir = mkdtempSync(join(tmpdir(), 'autonomy-user-flow-')) + configDir = join(tempDir, 'config') + previousConfigDir = process.env.CLAUDE_CONFIG_DIR + process.env.CLAUDE_CONFIG_DIR = configDir + resetStateForTests() + 
setOriginalCwd(tempDir) + setProjectRoot(tempDir) +}) + +afterEach(() => { + resetStateForTests() + if (previousConfigDir === undefined) { + delete process.env.CLAUDE_CONFIG_DIR + } else { + process.env.CLAUDE_CONFIG_DIR = previousConfigDir + } + if (tempDir) { + rmSync(tempDir, { recursive: true, force: true }) + } +}) + +describe('autonomy lifecycle user-equivalent CLI flow', () => { + test('status --deep works from a clean project without creating autonomy state', async () => { + const output = await runAutonomyCli(['status', '--deep']) + + expect(output).toContain('# Autonomy Deep Status') + expect(output).toContain('Autonomy runs: 0') + expect(output).toContain('Autonomy flows: 0') + expect(existsSync(join(tempDir, '.claude', 'autonomy', 'runs.json'))).toBe( + false, + ) + expect(existsSync(join(tempDir, '.claude', 'autonomy', 'flows.json'))).toBe( + false, + ) + }) + + test('real CLI can inspect, resume, and cancel a persisted managed flow', async () => { + await startManagedAutonomyFlowFromHeartbeatTask({ + rootDir: tempDir, + currentDir: tempDir, + task: { + name: 'manual-user-flow', + interval: '1h', + prompt: 'Manual lifecycle acceptance', + steps: [ + { + name: 'approve', + prompt: 'Wait for manual approval', + waitFor: 'manual', + }, + { + name: 'execute', + prompt: 'Execute approved work', + }, + ], + }, + }) + const [waitingFlow] = await listAutonomyFlows(tempDir) + expect(waitingFlow?.status).toBe('waiting') + + const status = await runAutonomyCli(['status', '--deep']) + expect(status).toContain('Autonomy flows: 1') + expect(status).toContain('Waiting: 1') + + const flows = await runAutonomyCli(['flows', '5']) + expect(flows).toContain(waitingFlow!.flowId) + expect(flows).toContain('waiting') + + const detailBefore = await runAutonomyCli(['flow', waitingFlow!.flowId]) + expect(detailBefore).toContain('Status: waiting') + expect(detailBefore).toContain('Current step: approve') + + const resume = await runAutonomyCli(['flow', 'resume', 
waitingFlow!.flowId]) + expect(resume).toContain('Prepared the next managed step') + expect(resume).toContain('Prompt:') + + const detailAfterResume = await runAutonomyCli([ + 'flow', + waitingFlow!.flowId, + ]) + expect(detailAfterResume).toContain('Status: queued') + expect(detailAfterResume).toContain('Latest run:') + + const cancel = await runAutonomyCli(['flow', 'cancel', waitingFlow!.flowId]) + expect(cancel).toContain('Cancelled flow') + + const [cancelledRun] = await listAutonomyRuns(tempDir) + const [cancelledFlow] = await listAutonomyFlows(tempDir) + expect(cancelledRun?.status).toBe('cancelled') + expect(cancelledFlow?.status).toBe('cancelled') + + const detailAfterCancel = await runAutonomyCli([ + 'flow', + waitingFlow!.flowId, + ]) + expect(detailAfterCancel).toContain('Status: cancelled') + }, 30000) +}) diff --git a/tests/integration/dependency-overrides.test.ts b/tests/integration/dependency-overrides.test.ts index b1549e82b3..65a6679527 100644 --- a/tests/integration/dependency-overrides.test.ts +++ b/tests/integration/dependency-overrides.test.ts @@ -2,13 +2,42 @@ import { describe, expect, test } from 'bun:test' import { mkdtempSync, rmSync, writeFileSync } from 'node:fs' import { createRequire } from 'node:module' import { tmpdir } from 'node:os' -import { join, resolve } from 'node:path' +import { dirname, join, resolve } from 'node:path' import { pathToFileURL } from 'node:url' const repoRoot = resolve(import.meta.dir, '..', '..') const uuidV4Pattern = /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/ +async function findPackageJson( + startPath: string, + expectedName: string, +): Promise { + let current = dirname(startPath) + for (let depth = 0; depth < 10; depth++) { + const candidate = join(current, 'package.json') + const file = Bun.file(candidate) + if (await file.exists()) { + try { + const parsed = JSON.parse(await file.text()) as { name?: unknown } + if (parsed.name === expectedName) { + return candidate + } + } 
catch { + // ignore parse errors and keep walking up + } + } + const parent = dirname(current) + if (parent === current) { + break + } + current = parent + } + throw new Error( + `package.json with name "${expectedName}" not found above ${startPath}`, + ) +} + describe('dependency security overrides', () => { test('mcpb can load patched inquirer prompts from its package context', async () => { const mcpbRequire = createRequire(import.meta.resolve('@anthropic-ai/mcpb')) @@ -28,10 +57,7 @@ describe('dependency security overrides', () => { ) const gaxios = vertexRequire('gaxios') as { request(options: { - adapter(options: { - headers: Headers - url: string - }): Promise<{ + adapter(options: { headers: Headers; url: string }): Promise<{ config: unknown data: string headers: Record @@ -39,7 +65,7 @@ describe('dependency security overrides', () => { status: number statusText: string }> - multipart: Array<{ body: string; headers: Record }> + multipart: Array<{ body: string; headers: Record }> url: string }): Promise<{ status: number }> } @@ -47,8 +73,10 @@ describe('dependency security overrides', () => { const response = await gaxios.request({ url: 'https://example.com/upload', - multipart: [{ body: 'payload', headers: { 'Content-Type': 'text/plain' } }], - adapter: async (options) => { + multipart: [ + { body: 'payload', headers: { 'Content-Type': 'text/plain' } }, + ], + adapter: async options => { contentType = options.headers.get('content-type') ?? 
undefined return { config: options, @@ -62,14 +90,14 @@ describe('dependency security overrides', () => { }) expect(response.status).toBe(200) - expect(contentType).toMatch( - /^multipart\/related; boundary=[0-9a-f-]{36}$/, - ) + expect(contentType).toMatch(/^multipart\/related; boundary=[0-9a-f-]{36}$/) expect(contentType?.split('boundary=')[1]).toMatch(uuidV4Pattern) }) test('azure identity msal guid generation works through its package context', () => { - const identityRequire = createRequire(import.meta.resolve('@azure/identity')) + const identityRequire = createRequire( + import.meta.resolve('@azure/identity'), + ) const msal = identityRequire('@azure/msal-node') as { CryptoProvider: new () => { createNewGuid(): string } } @@ -78,7 +106,7 @@ describe('dependency security overrides', () => { expect(cryptoProvider.createNewGuid()).toMatch(uuidV4Pattern) }) - test('remote control markdown renderer loads streamdown and mermaid', async () => { + test('remote control markdown renderer resolves streamdown and mermaid', async () => { const rcsRequire = createRequire( join(repoRoot, 'packages/remote-control-server/package.json'), ) @@ -90,13 +118,26 @@ describe('dependency security overrides', () => { const uuid = (await import( pathToFileURL(streamdownRequire.resolve('uuid')).href )) as { v4(): string } - const mermaid = (await import( - pathToFileURL(streamdownRequire.resolve('mermaid')).href - )) as { default?: { initialize?: unknown } } + const mermaidPath = streamdownRequire.resolve('mermaid') + // mermaid does not export ./package.json in its exports map, so resolving + // 'mermaid/package.json' throws ERR_PACKAGE_PATH_NOT_EXPORTED in runtimes + // that honor exports semantics. Walk up from the resolved entry until a + // package.json with name === 'mermaid' is found. 
+ const mermaidPackagePath = await findPackageJson(mermaidPath, 'mermaid') + const mermaidPackage = JSON.parse( + await Bun.file(mermaidPackagePath).text(), + ) as { + name?: unknown + exports?: { '.'?: { import?: unknown } } + } expect(streamdown.Streamdown).toBeDefined() expect(uuid.v4()).toMatch(uuidV4Pattern) - expect(typeof mermaid.default?.initialize).toBe('function') + expect(mermaidPackage.name).toBe('mermaid') + expect(mermaidPath).toContain('mermaid.core.mjs') + expect(mermaidPackage.exports?.['.']?.import).toBe( + './dist/mermaid.core.mjs', + ) }) test('grpc proto-loader keeps its protobuf 7 parser path working', () => { diff --git a/tests/mocks/auth.ts b/tests/mocks/auth.ts new file mode 100644 index 0000000000..7c0da17a75 --- /dev/null +++ b/tests/mocks/auth.ts @@ -0,0 +1,31 @@ +/** + * Shared mock for `src/utils/auth.js`. Use it via: + * + * import { authMock } from '../../tests/mocks/auth' + * mock.module('src/utils/auth.js', authMock) + * + * Tests that need different return values can override the helper used by + * the suite (e.g. by extending this object and re-registering with mock.module). + * Always extend here rather than inlining a different shape per test, so the + * surface stays consistent when `auth.ts` exports change. + */ +export const authMock = () => ({ + // Mirrors the production contract: src/utils/auth.ts returns + // Promise<boolean> ("did the access token change") and a token object that + // carries scopes, subscriptionType, expiresAt, etc. Tests that branch on + // these values must see the full shape so they can not silently drift away + // from production.
+ checkAndRefreshOAuthTokenIfNeeded: async () => false, + getClaudeAIOAuthTokens: () => ({ + accessToken: 'token', + refreshToken: null, + expiresAt: null, + scopes: ['user:inference'], + subscriptionType: null, + rateLimitTier: null, + }), + isClaudeAISubscriber: () => true, + isProSubscriber: () => false, + isMaxSubscriber: () => false, + isTeamSubscriber: () => false, +}) diff --git a/tests/mocks/file-system.ts b/tests/mocks/file-system.ts index e356ec0155..c46defc6c7 100644 --- a/tests/mocks/file-system.ts +++ b/tests/mocks/file-system.ts @@ -30,3 +30,21 @@ export async function createTempSubdir( await mkdir(path, { recursive: true }) return path } + +/** + * Read a file under the test temp dir as utf-8 text. Mirrors the node:fs + * `readFileSync(path, 'utf-8')` ergonomics but uses Bun's native file API so + * tests stay on the Bun-only runtime contract. + */ +export async function readTempFile(path: string): Promise { + return Bun.file(path).text() +} + +/** + * Best-effort existence check for a file under the test temp dir. Uses Bun's + * native file API: Bun.file(path).exists() resolves true for regular files and + * false for directories, so this helper cannot be used to probe directories. + */ +export async function tempPathExists(path: string): Promise { + return Bun.file(path).exists() +}