Skip to content

Commit 49e1675

Browse files
authored
Fix desktop health flaps (#1028)
1 parent dcb121d commit 49e1675

4 files changed

Lines changed: 115 additions & 64 deletions

File tree

apps/desktop/src/main/diagnostics.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ const exportStamp = () =>
181181

182182
const buildManifest = () => {
183183
const settings = getServerSettings();
184+
const executorLogs = join(app.getPath("home"), ".executor", "logs");
184185
return {
185186
generated: new Date().toISOString(),
186187
app: app.getName(),
@@ -195,6 +196,7 @@ const buildManifest = () => {
195196
paths: {
196197
userData: app.getPath("userData"),
197198
logs: dirname(log.transports.file.getFile().path),
199+
executorLogs,
198200
crashDumps: app.getPath("crashDumps"),
199201
},
200202
// The bearer token is never included — it stays in auth.json on the machine.
@@ -215,8 +217,10 @@ export const exportDiagnostics = async (): Promise<string> => {
215217
const { readFile } = await import("node:fs/promises");
216218

217219
const logsDir = dirname(log.transports.file.getFile().path);
220+
const executorLogsDir = join(app.getPath("home"), ".executor", "logs");
218221
const entries: ZipEntry[] = [
219222
...collectFiles(logsDir, "logs"),
223+
...collectFiles(executorLogsDir, "executor-logs"),
220224
...collectFiles(app.getPath("crashDumps"), "crash-dumps"),
221225
];
222226

apps/desktop/src/main/index.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,11 +257,14 @@ const ensureSupervisedConnection = async (): Promise<SidecarConnection | null> =
257257
// reconnecting overlay while it's down, and reload once it's back.
258258
let supervisedMonitorTimer: ReturnType<typeof setInterval> | null = null;
259259
let supervisedDaemonDown = false;
260+
let supervisedMonitorMisses = 0;
261+
const SUPERVISED_MONITOR_MISSES_BEFORE_DOWN = 3;
260262

261263
const stopSupervisedMonitor = () => {
262264
if (supervisedMonitorTimer) clearInterval(supervisedMonitorTimer);
263265
supervisedMonitorTimer = null;
264266
supervisedDaemonDown = false;
267+
supervisedMonitorMisses = 0;
265268
};
266269

267270
const armSupervisedMonitor = () => {
@@ -271,13 +274,16 @@ const armSupervisedMonitor = () => {
271274
const live = await attachToSupervisedDaemon();
272275
const window = liveMainWindow();
273276
if (!live) {
277+
supervisedMonitorMisses += 1;
278+
if (supervisedMonitorMisses < SUPERVISED_MONITOR_MISSES_BEFORE_DOWN) return;
274279
if (!supervisedDaemonDown && window) {
275280
supervisedDaemonDown = true;
276281
const html = sidecarCrashHtml({ reported: errorReportingEnabled });
277282
void window.loadURL(`data:text/html;charset=utf-8,${encodeURIComponent(html)}`);
278283
}
279284
return;
280285
}
286+
supervisedMonitorMisses = 0;
281287
if (supervisedDaemonDown) {
282288
supervisedDaemonDown = false;
283289
connection = live;

apps/desktop/src/main/sidecar.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,7 @@ const isDaemonReachable = async (origin: string): Promise<boolean> => {
468468
* sidecar. Reads `server.json`, confirms the recorded process is alive and the
469469
* endpoint answers, and returns a child-less `SidecarConnection` flagged
470470
* `supervisedDaemon: true`. Returns null when no usable supervised daemon is
471-
* present (the caller then falls back to managed-spawn).
471+
* present.
472472
*
473473
* Only a `cli-daemon` manifest is treated as supervised — a `desktop-sidecar`
474474
* manifest belongs to a managed sidecar (ours or another desktop instance) and
@@ -487,8 +487,9 @@ export async function attachToSupervisedDaemon(): Promise<SidecarConnection | nu
487487
const auth = manifest.connection.auth;
488488
const authToken = auth && auth.kind === "bearer" ? auth.token : "";
489489
if (!(await isDaemonReachable(origin))) {
490-
sidecarLog.info(`removing stale supervised daemon manifest for unreachable ${origin}`);
491-
removeManifestIfOwnedBy(dataDir, manifest.pid);
490+
sidecarLog.warn(
491+
`supervised daemon at ${origin} (pid ${manifest.pid}) did not answer the health probe; keeping its manifest because the process is still alive`,
492+
);
492493
return null;
493494
}
494495

e2e/desktop-packaged/supervised-regressions.test.ts

Lines changed: 101 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -489,15 +489,6 @@ const closePackaged = async (app: PackagedApp | undefined): Promise<void> => {
489489
await stopProcess(app?.child);
490490
};
491491

492-
const waitUntil = async (predicate: () => boolean, timeoutMs: number): Promise<boolean> => {
493-
const deadline = Date.now() + timeoutMs;
494-
for (;;) {
495-
if (predicate()) return true;
496-
if (Date.now() >= deadline) return false;
497-
await new Promise((resolve) => setTimeout(resolve, 100));
498-
}
499-
};
500-
501492
const waitForServerConnectionLabel = async (
502493
page: CdpPage,
503494
expectedText: string,
@@ -661,9 +652,12 @@ if (!guiAvailable() || !packagedSingleInstanceAvailable()) {
661652
it.skip("Desktop packaged supervised attach security (needs a GUI display and no already-running Executor.app)", () => {});
662653
} else {
663654
scenario(
664-
"Desktop packaged supervised attach · stale manifest probe does not send the saved bearer",
655+
"Desktop packaged supervised attach · a slow live daemon does not look crashed",
665656
{ timeout: 240_000 },
666-
Effect.promise(() => runStaleManifestProbe()),
657+
Effect.gen(function* () {
658+
const runDir = yield* RunDir;
659+
yield* Effect.promise(() => runSlowLiveDaemonProbe(runDir));
660+
}),
667661
);
668662

669663
scenario(
@@ -685,81 +679,127 @@ if (!guiAvailable() || !packagedSingleInstanceAvailable()) {
685679
);
686680
}
687681

688-
const runStaleManifestProbe = async () => {
689-
const home = mkdtempSync(join(tmpdir(), "executor-pkg-stale-probe-"));
682+
const writeCliDaemonManifest = (input: {
683+
readonly controlDir: string;
684+
readonly dataDir: string;
685+
readonly origin: string;
686+
readonly displayName: string;
687+
readonly token: string;
688+
}): void => {
689+
mkdirSync(input.controlDir, { recursive: true });
690+
writeFileSync(
691+
join(input.controlDir, "server.json"),
692+
serializeExecutorLocalServerManifest({
693+
version: 1,
694+
kind: "cli-daemon",
695+
pid: process.pid,
696+
startedAt: new Date().toISOString(),
697+
dataDir: input.dataDir,
698+
scopeDir: input.dataDir,
699+
connection: normalizeExecutorServerConnection({
700+
origin: input.origin,
701+
displayName: input.displayName,
702+
auth: { kind: "bearer", token: input.token },
703+
}),
704+
owner: { client: "cli", version: null, executablePath: null },
705+
}),
706+
{ mode: 0o600 },
707+
);
708+
};
709+
710+
const runSlowLiveDaemonProbe = async (runDir: string) => {
711+
const home = mkdtempSync(join(tmpdir(), "executor-pkg-slow-live-daemon-"));
690712
const dataDir = join(home, ".executor");
691713
const controlDir = join(dataDir, "server-control");
692714
const manifestPath = join(controlDir, "server.json");
693-
const token = "stale-manifest-leaked-token";
715+
const token = "slow-live-daemon-token";
694716
const launchdSnapshot = captureLaunchdService();
695717
const requests: Array<{ readonly url: string; readonly authorization: string | null }> = [];
696-
let resolveFirst!: () => void;
697-
const firstRequest = new Promise<void>((resolve) => {
698-
resolveFirst = resolve;
718+
let healthMode: "ok" | "slow" = "ok";
719+
let slowHealthResponses = 0;
720+
let resolveThirdSlowHealthResponse!: () => void;
721+
const thirdSlowHealthResponse = new Promise<void>((resolve) => {
722+
resolveThirdSlowHealthResponse = resolve;
699723
});
700724
const server = createServer((req: IncomingMessage, res) => {
725+
const url = req.url ?? "/";
701726
requests.push({
702-
url: req.url ?? "/",
727+
url,
703728
authorization: req.headers.authorization ?? null,
704729
});
705-
resolveFirst();
730+
if (url.startsWith("/api/health")) {
731+
if (healthMode === "slow") {
732+
setTimeout(() => {
733+
slowHealthResponses += 1;
734+
if (slowHealthResponses >= 3) resolveThirdSlowHealthResponse();
735+
if (res.destroyed || res.writableEnded) return;
736+
res.writeHead(200, { "content-type": "text/plain" });
737+
res.end("ok");
738+
}, 5_000);
739+
return;
740+
}
741+
res.writeHead(200, { "content-type": "text/plain" });
742+
res.end("ok");
743+
return;
744+
}
706745
res.writeHead(200, { "content-type": "text/html" });
707-
res.end("<!doctype html><title>stale daemon</title><body>stale daemon</body>");
746+
res.end("<!doctype html><title>Executor</title><body><main>Fake Executor UI</main></body>");
708747
});
709748
await new Promise<void>((resolve) => server.listen(0, "127.0.0.1", resolve));
710749
const port = (server.address() as net.AddressInfo).port;
711-
let appProcess: ChildProcess | undefined;
712-
let appOutput = "";
750+
let app: PackagedApp | undefined;
713751

714752
try {
715-
mkdirSync(controlDir, { recursive: true });
716-
writeFileSync(
717-
join(controlDir, "server.json"),
718-
serializeExecutorLocalServerManifest({
719-
version: 1,
720-
kind: "cli-daemon",
721-
pid: process.pid,
722-
startedAt: new Date().toISOString(),
723-
dataDir,
724-
scopeDir: dataDir,
725-
connection: normalizeExecutorServerConnection({
726-
origin: `http://127.0.0.1:${port}`,
727-
displayName: "Stale daemon",
728-
auth: { kind: "bearer", token },
729-
}),
730-
owner: { client: "cli", version: null, executablePath: null },
731-
}),
732-
{ mode: 0o600 },
733-
);
734-
735-
appProcess = spawn(requireBundle().app, [], {
736-
env: packagedAppEnv(home),
737-
stdio: ["ignore", "pipe", "pipe"],
753+
writeCliDaemonManifest({
754+
controlDir,
755+
dataDir,
756+
origin: `http://127.0.0.1:${port}`,
757+
displayName: "Slow live daemon",
758+
token,
738759
});
739-
const collectOutput = (chunk: Buffer) => {
740-
appOutput = (appOutput + chunk.toString()).slice(-8_192);
741-
};
742-
appProcess.stdout?.on("data", collectOutput);
743-
appProcess.stderr?.on("data", collectOutput);
744760

745-
const probed = await Promise.race([
746-
firstRequest.then(() => true),
747-
new Promise<false>((resolve) => setTimeout(() => resolve(false), 60_000)),
748-
]);
749-
expect(probed, `packaged app probed the stale manifest endpoint\n${appOutput}`).toBe(true);
761+
app = await launchPackaged(home);
762+
const page = app.cdp;
763+
await page.waitForText("Fake Executor UI", 120_000);
764+
await page.screenshot(join(runDir, "01-attached-to-fake-daemon.png"));
750765

766+
const firstHealthProbe = requests.find((request) => request.url.startsWith("/api/health"));
751767
expect(
752-
requests[0]?.authorization ?? null,
753-
"the stale-manifest reachability probe must not disclose the saved bearer",
768+
firstHealthProbe?.authorization ?? null,
769+
"the supervised-daemon health probe must not disclose the saved bearer",
754770
).toBeNull();
755771

756-
const manifestRemoved = await waitUntil(() => !existsSync(manifestPath), 15_000);
772+
healthMode = "slow";
773+
const sawSlowMonitorProbe = await Promise.race([
774+
thirdSlowHealthResponse.then(() => true),
775+
new Promise<false>((resolve) => setTimeout(() => resolve(false), 60_000)),
776+
]);
777+
expect(
778+
sawSlowMonitorProbe,
779+
"the packaged app should continue probing the supervised daemon after initial attach",
780+
).toBe(true);
781+
782+
expect(
783+
requests
784+
.filter((request) => request.url.startsWith("/api/health"))
785+
.map((request) => request.authorization),
786+
"supervised-daemon health probes must never carry the saved bearer",
787+
).not.toContain(`Bearer ${token}`);
757788
expect(
758-
manifestRemoved,
759-
"a live pid with a failed health probe must be removed before desktop falls back",
789+
existsSync(manifestPath),
790+
"a live daemon's manifest must survive transient health probe failures",
760791
).toBe(true);
792+
expect(
793+
await page.textPresent("The local Executor server stopped unexpectedly"),
794+
"one slow monitor probe should not show the crash screen",
795+
).toBe(false);
796+
expect(
797+
await page.textPresent("Fake Executor UI"),
798+
"the original renderer should stay loaded",
799+
).toBe(true);
800+
await page.screenshot(join(runDir, "02-still-rendering-after-slow-health.png"));
761801
} finally {
762-
await stopProcess(appProcess);
802+
await closePackaged(app);
763803
await restoreLaunchdService(launchdSnapshot);
764804
await new Promise<void>((resolve) => server.close(() => resolve()));
765805
rmSync(home, { recursive: true, force: true });

0 commit comments

Comments
 (0)