From ed48bf850bd96ed914a90286faa7d2fb3f2b3952 Mon Sep 17 00:00:00 2001 From: harjoth Date: Tue, 2 Jun 2026 21:47:39 -0700 Subject: [PATCH] fix(browse): recover a late-healthy detached daemon instead of a false "failed to start" (#1846) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit startServer polls /health for MAX_START_WAIT, then throws "Server failed to start within Ns" on timeout. But the server is spawned detached + unref'd, so it keeps booting independently of the CLI's poll loop. On a loaded machine the daemon can become healthy in the gap between the loop's final tick and the throw — the very next `browse status` shows it listening and healthy. The probe timed out; the launch did not. - Add a final readState() + isServerHealthy() re-check before the timeout throw, mirroring the post-loop recovery already in ensureServer(). A genuinely failed server stays unhealthy and falls through to the existing error report. - Make the startup budget overridable via BROWSE_START_TIMEOUT (ms) through a new pure, exported resolveStartTimeout() — matching the BROWSE_* tunable convention in server.ts — so hosts where even the platform ceiling is too low have an escape hatch. Complements #1732 (which only raised the constant): this removes the false-negative at the throw site itself, on any platform/load. Test: browse/test/cli-start-final-healthcheck.test.ts — behavioral coverage of resolveStartTimeout overrides + a static invariant that the final re-check sits between the poll loop and the throw. Co-Authored-By: Claude Opus 4.8 --- browse/src/cli.ts | 32 +++++++- .../test/cli-start-final-healthcheck.test.ts | 77 +++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 browse/test/cli-start-final-healthcheck.test.ts diff --git a/browse/src/cli.ts b/browse/src/cli.ts index 59327b7923..d9424e1e18 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -21,7 +21,26 @@ import { spawnTerminalAgent } from './terminal-agent-control'; const config = resolveConfig(); const IS_WINDOWS = process.platform === 'win32'; -const MAX_START_WAIT = IS_WINDOWS ? 15000 : (process.env.CI ? 30000 : 8000); // Node+Chromium takes longer on Windows + +/** + * Startup health-probe budget (ms) for a freshly spawned server. The daemon is + * detached + unref'd, so it keeps booting regardless of how long the CLI is + * willing to poll — this constant only bounds how long `startServer` waits + * before reporting failure. + * + * Overridable via `BROWSE_START_TIMEOUT` (ms) for hosts where even the platform + * ceiling isn't enough — e.g. Windows under heavy load (#1846), where the 15s + * budget can still elapse before a busy box finishes booting Node+Chromium. + * Mirrors the `BROWSE_*` tunable convention used throughout server.ts + * (BROWSE_PORT, BROWSE_IDLE_TIMEOUT, ...). A non-positive or unparseable value + * falls back to the platform default. Pure + exported for tests. + */ +export function resolveStartTimeout(env: NodeJS.ProcessEnv = process.env): number { + const platformDefault = IS_WINDOWS ? 15000 : (env.CI ? 30000 : 8000); // Node+Chromium takes longer on Windows + const override = parseInt(env.BROWSE_START_TIMEOUT || '', 10); + return Number.isFinite(override) && override > 0 ? override : platformDefault; +} +const MAX_START_WAIT = resolveStartTimeout(); export function resolveServerScript( env: Record = process.env, @@ -357,6 +376,17 @@ async function startServer(extraEnv?: Record): Promise fs.readFileSync(CLI, 'utf-8'); + +describe('#1846 startServer false-negative on a late-healthy detached daemon', () => { + test('a final health re-check sits between the poll loop and the timeout throw', () => { + const src = read(); + const throwIdx = src.indexOf('Server failed to start within'); + expect(throwIdx).toBeGreaterThan(-1); + + // The startServer poll loop ends at its `await Bun.sleep(100)`; the final + // re-check must live AFTER that loop and BEFORE the timeout throw. + const loopEnd = src.lastIndexOf('await Bun.sleep(100)', throwIdx); + expect(loopEnd).toBeGreaterThan(-1); + const between = src.slice(loopEnd, throwIdx); + + // It must re-read state and re-probe health, then be able to return — i.e. + // a genuine recovery path, not just a comment. + expect(between).toContain('readState()'); + expect(between).toMatch(/isServerHealthy\([^)]*\)/); + expect(between).toMatch(/return\s+\w+;/); + }); + + test('the re-check returns the recovered state rather than swallowing it', () => { + const src = read(); + // Guard against a refactor that probes health but forgets to return the + // state (which would re-introduce the false negative). + expect(src).toMatch(/if\s*\([^)]*await\s+isServerHealthy\([^)]*\)\)\s*\{\s*return\s+\w+;/); + }); +}); + +describe('#1846 BROWSE_START_TIMEOUT env override (resolveStartTimeout)', () => { + const platformDefault = resolveStartTimeout({} as NodeJS.ProcessEnv); + + test('platform default is a positive millisecond budget when unset', () => { + expect(platformDefault).toBeGreaterThan(0); + }); + + test('honors a positive BROWSE_START_TIMEOUT override', () => { + expect(resolveStartTimeout({ BROWSE_START_TIMEOUT: '42000' } as NodeJS.ProcessEnv)).toBe(42000); + }); + + test('falls back to the platform default for non-positive / unparseable values', () => { + for (const bad of ['0', '-5', 'abc', '', ' ']) { + expect(resolveStartTimeout({ BROWSE_START_TIMEOUT: bad } as NodeJS.ProcessEnv)).toBe(platformDefault); + } + }); + + test('MAX_START_WAIT is wired through resolveStartTimeout (no stray hardcoded constant)', () => { + const src = read(); + expect(src).toMatch(/const\s+MAX_START_WAIT\s*=\s*resolveStartTimeout\(\)/); + }); +});