feat(runner): serialize start-sync invocations via setup-window lock

heavygee · cursoragent · heavygee · commit ec8e06a44a2f · 2026-05-31T23:22:12.000+01:00
The mtime-driven self-restart + the operator drop-in's ExecStartPre
together created a race window where two `hapi runner start-sync`
invocations could each see the other's runner as stale, kill it, and
then race for the runtime lock. The 2026-05-31 22:40 BST incident
captured the worst-case outcome: invocation A killed the live runner,
took SIGTERM ~200ms later from invocation B's setup, and exited
cleanly - leaving the machine offline because systemd's
Restart=on-failure does not recover from exit code 0.

Add a separate `runner.start.lock` held by `startRunner()` for the
duration of its setup window only (version check + stopRunner + new
runtime-lock acquisition + state-file write). The lock is released as
soon as the state file is written, so a legitimate follow-up invocation
is delayed by only a few hundred ms. A lock file older than 15s is
treated as stale and removed, in case a setup crashes between acquire
and release.

When a second start-sync arrives while the first is mid-setup:
  1. It blocks on acquireRunnerStartLock for up to ~6s.
  2. By release, the first invocation owns the runtime lock + state
     file with the current version.
  3. The blocked invocation's subsequent
     isRunnerRunningCurrentlyInstalledHappyVersion() returns
     "matching", so it exits cleanly via the existing
     "Runner already running with matching version" path - no kill,
     no race.

If the lock cannot be acquired within the window (another invocation
is genuinely stuck), bail out instead of compounding the problem.

Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/cli/src/configuration.ts b/cli/src/configuration.ts
@@ -52,6 +52,7 @@ class Configuration {
     public readonly privateKeyFile: string
     public readonly runnerStateFile: string
     public readonly runnerLockFile: string
+    public readonly runnerStartLockFile: string
     public readonly currentCliVersion: string
 
     public readonly isExperimentalEnabled: boolean
@@ -80,6 +81,14 @@ class Configuration {
         this.privateKeyFile = join(this.happyHomeDir, 'access.key')
         this.runnerStateFile = join(this.happyHomeDir, 'runner.state.json')
         this.runnerLockFile = join(this.happyHomeDir, 'runner.state.json.lock')
+        // Held by `startRunner()` during its setup window only (version check +
+        // stopRunner + new runtime-lock acquisition + state-file write). NOT
+        // held for the runner's lifetime - that's runnerLockFile's job. The
+        // start lock serialises concurrent `runner start-sync` invocations so
+        // they cannot race past each other into the kill-old / start-new
+        // sequence. Stale-clean via mtime: any holder older than 15s is
+        // assumed dead (the setup window is well under 1s on healthy hosts).
+        this.runnerStartLockFile = join(this.happyHomeDir, 'runner.start.lock')
 
         this.isExperimentalEnabled = ['true', '1', 'yes'].includes(process.env.HAPI_EXPERIMENTAL?.toLowerCase() || '')
 
diff --git a/cli/src/persistence.ts b/cli/src/persistence.ts
@@ -274,3 +274,73 @@ export async function releaseRunnerLock(lockHandle: FileHandle): Promise<void> {
     }
   } catch { }
 }
+
+/**
+ * Acquire the runner *start* lock - held by `startRunner()` only for its setup
+ * window (version check + stopRunner + new runtime-lock + state write).
+ * Returns the open FileHandle, or null if another start-sync invocation still
+ * holds the lock after `maxAttempts` tries spaced `delayMs` apart.
+ *
+ * Separate from `acquireRunnerLock()`: runnerLockFile (runner.state.json.lock)
+ * is held for the *lifetime* of a running runner; runnerStartLockFile
+ * (runner.start.lock) only serialises *invocations* so two `hapi runner
+ * start-sync` calls cannot race each other into the kill-old / start-new
+ * sequence (the cause of the 2026-05-31 22:40 incident).
+ *
+ * Stale cleanup: a lock whose mtime is older than `staleAfterMs` (default 15s)
+ * is removed; the first such removal retries at once without using an attempt.
+ * Errors other than EEXIST are rethrown after undoing any half-created lock.
+ */
+export async function acquireRunnerStartLock(options: {
+  maxAttempts?: number;
+  delayMs?: number;
+  staleAfterMs?: number;
+} = {}): Promise<FileHandle | null> {
+  const maxAttempts = options.maxAttempts ?? 30;       // up to ~6s of waiting
+  const delayMs = options.delayMs ?? 200;
+  const staleAfterMs = options.staleAfterMs ?? 15_000;
+  let freeRetryUsed = false;
+
+  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+    let fileHandle: FileHandle | undefined;
+    try {
+      fileHandle = await open(configuration.runnerStartLockFile, 'wx');
+      await fileHandle.writeFile(`${process.pid} ${new Date().toISOString()}`);
+      return fileHandle;
+    } catch (error: any) {
+      if (fileHandle) { // created the lock but could not stamp it: undo it
+        await fileHandle.close().catch(() => { });
+        await unlink(configuration.runnerStartLockFile).catch(() => { });
+      }
+      if (fileHandle || error.code !== 'EEXIST') throw error;
+      // Stale-cleanup probe before sleeping.
+      try {
+        const stats = await stat(configuration.runnerStartLockFile);
+        if (Date.now() - stats.mtimeMs > staleAfterMs) {
+          await unlink(configuration.runnerStartLockFile);
+          // One free retry, so a stale lock met on the last attempt is reclaimed.
+          if (!freeRetryUsed) { freeRetryUsed = true; attempt--; }
+          continue;
+        }
+      } catch { /* lock vanished or is unremovable - wait like a normal miss */ }
+      if (attempt === maxAttempts) return null;
+      await new Promise(resolve => setTimeout(resolve, delayMs));
+    }
+  }
+  return null;
+}
+
+/**
+ * Give back the start lock obtained from acquireRunnerStartLock: close the
+ * handle, then delete the lock file if it is still on disk. Errors from
+ * either step are swallowed, so a missing file is not a failure.
+ */
+export async function releaseRunnerStartLock(lockHandle: FileHandle): Promise<void> {
+  try {
+    await lockHandle.close();
+  } catch { /* close errors are ignored - still attempt the removal below */ }
+  try {
+    const startLockPath = configuration.runnerStartLockFile;
+    if (existsSync(startLockPath)) unlinkSync(startLockPath);
+  } catch { /* removal is best-effort */ }
+}
diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts
@@ -11,7 +11,7 @@ import { configuration } from '@/configuration';
 import packageJson from '../../package.json';
 import { getEnvironmentInfo } from '@/ui/doctor';
 import { spawnHappyCLI } from '@/utils/spawnHappyCLI';
-import { writeRunnerState, RunnerLocallyPersistedState, readRunnerState, readSettings, acquireRunnerLock, releaseRunnerLock } from '@/persistence';
+import { writeRunnerState, RunnerLocallyPersistedState, readRunnerState, readSettings, acquireRunnerLock, releaseRunnerLock, acquireRunnerStartLock, releaseRunnerStartLock } from '@/persistence';
 import { isProcessAlive, isWindows, killProcess, killProcessByChildProcess } from '@/utils/process';
 import { PERMISSION_MODES } from '@hapi/protocol/modes';
 import { withRetry } from '@/utils/time';
@@ -98,6 +98,38 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
   logger.debug('[RUNNER RUN] Starting runner process...');
   logger.debugLargeJson('[RUNNER RUN] Environment', getEnvironmentInfo());
 
+  // Serialise concurrent `runner start-sync` invocations. The 2026-05-31 22:40
+  // BST incident reproduced what happens without this lock: a terminal-launched
+  // start-sync (no HAPI_DISABLE_VERSION_HANDOFF) killed the systemd-owned live
+  // runner via stopRunner(), then took an external SIGTERM ~200ms later before
+  // its replacement could register - net result was a 3-minute machine outage.
+  //
+  // With the start lock held: if any other startRunner() is already inside its
+  // setup window (kill-old + acquire-runtime-lock + write-state), we wait up
+  // to ~6s for it to release. By then the winning invocation has a running
+  // runner; our subsequent isRunnerRunningCurrentlyInstalledHappyVersion()
+  // check returns "matching", and we exit cleanly with no kill action.
+  //
+  // If the start lock cannot be acquired within the window, another invocation
+  // is genuinely stuck - we bail out instead of compounding the problem.
+  const startLockHandle = await acquireRunnerStartLock();
+  if (!startLockHandle) {
+    logger.debug('[RUNNER RUN] Another runner start-sync invocation is in its setup window; bailing out');
+    console.log('Another `hapi runner start-sync` invocation is currently starting up; not racing it');
+    process.exit(0);
+  }
+
+  // From here until we either bail or finish writing runner.state.json with
+  // the runtime lock held, any early `process.exit()` must release the start
+  // lock first. The lock file is also stale-cleaned after 15s (see
+  // acquireRunnerStartLock) so a crashed setup cannot block the next attempt.
+  let startLockReleased = false;
+  const releaseStartLockOnce = async () => {
+    if (startLockReleased) return;
+    startLockReleased = true;
+    await releaseRunnerStartLock(startLockHandle);
+  };
+
   // Check if already running
   // Check if running runner version matches current CLI version
   const runningRunnerVersionMatches = await isRunnerRunningCurrentlyInstalledHappyVersion();
@@ -107,13 +139,15 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
   } else {
     logger.debug('[RUNNER RUN] Runner version matches, keeping existing runner');
     console.log('Runner already running with matching version');
+    await releaseStartLockOnce();
     process.exit(0);
   }
 
   // Acquire exclusive lock (proves runner is running)
   const runnerLockHandle = await acquireRunnerLock(5, 200);
   if (!runnerLockHandle) {
     logger.debug('[RUNNER RUN] Runner lock file already held, another runner is running');
+    await releaseStartLockOnce();
     process.exit(0);
   }
 
@@ -684,6 +718,13 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
     writeRunnerState(fileState);
     logger.debug('[RUNNER RUN] Runner state written');
 
+    // Setup window complete: runtime lock + state file are now owned by us, so
+    // any subsequent start-sync invocation will see "Runner already running with
+    // matching version" and bow out without racing. Release the start lock to
+    // let any queued retries proceed (they will exit cleanly via the version
+    // check). Holding it longer would just delay legitimate next-invocations.
+    await releaseStartLockOnce();
+
     // Prepare initial runner state
     const initialRunnerState: RunnerState = {
       status: 'offline',
@@ -942,6 +983,10 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
       await stopControlServer();
       await cleanupRunnerState();
       await releaseRunnerLock(runnerLockHandle);
+      // Defensive: start-lock should already be released after writeRunnerState,
+      // but if cleanupAndShutdown fires before that point (rare, e.g. SIGTERM
+      // during auth setup) we must not leave it on disk for 15s.
+      await releaseStartLockOnce();
 
       logger.debug('[RUNNER RUN] Cleanup completed, exiting process');
       process.exit(0);
@@ -954,6 +999,9 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
     await cleanupAndShutdown(shutdownRequest.source, shutdownRequest.errorMessage);
   } catch (error) {
     logger.debug('[RUNNER RUN][FATAL] Failed somewhere unexpectedly - exiting with code 1', error);
+    // Fire-and-forget: process.exit(1) below runs before the async release gets
+    // past closing the handle, so the 15s stale-cleanup reclaims the lock file.
+    void releaseStartLockOnce().catch(() => { });
     process.exit(1);
   }
 }