Skip to content

Commit 932bbd7

Browse files
committed
fix: address PR #198 review feedback from Greptile and Sentry
- Fix circuit breaker race: separate isCircuitOpen check from half-open state transition, call transitionToHalfOpen only after all input validation passes (Sentry) - Fix orphan cleanup error swallowing: inspect tmux kill-session errors, only treat 'session not found' as success, report real failures (Sentry) - Fix bridge recovery gaps: add explicit failure actions when startup script is missing or no live socket UUIDs found (Greptile) - Fix shell injection: use execFileSync for tmux commands instead of execSync with string interpolation (Greptile) - Replace all dynamic require() calls with static ES module imports in both agent-spawn.ts and heartbeat.ts (Greptile) - Replace shell tail command with readFileSync for lifecycle log reading in spawn_status tool (Greptile)
1 parent 760d1e3 commit 932bbd7

File tree

2 files changed

+92
-43
lines changed

2 files changed

+92
-43
lines changed

pi/extensions/agent-spawn.ts

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
22
import { Type } from "@sinclair/typebox";
3-
import { appendFileSync, existsSync, mkdirSync, readlinkSync, statSync } from "node:fs";
3+
import { appendFileSync, existsSync, mkdirSync, readFileSync, readlinkSync, statSync } from "node:fs";
44
import net from "node:net";
55
import { homedir } from "node:os";
66
import { dirname, join, resolve } from "node:path";
@@ -63,14 +63,20 @@ function recordFailure(cb: CircuitBreaker): void {
6363

6464
function isCircuitOpen(cb: CircuitBreaker): boolean {
6565
if (cb.state !== "open") return false;
66-
// Check if cooldown has elapsed → transition to half-open
66+
// Check if cooldown has elapsed — eligible for half-open probe
6767
if (cb.lastFailureAt && Date.now() - cb.lastFailureAt >= CIRCUIT_COOLDOWN_MS) {
68-
cb.state = "half-open";
6968
return false;
7069
}
7170
return true;
7271
}
7372

73+
/** Transition to half-open state. Call only after input validation passes. */
74+
function transitionToHalfOpen(cb: CircuitBreaker): void {
75+
if (cb.state === "open" && cb.lastFailureAt && Date.now() - cb.lastFailureAt >= CIRCUIT_COOLDOWN_MS) {
76+
cb.state = "half-open";
77+
}
78+
}
79+
7480
function circuitStatus(cb: CircuitBreaker): string {
7581
const cooldownRemaining =
7682
cb.state === "open" && cb.lastFailureAt
@@ -401,6 +407,10 @@ export default function agentSpawnExtension(pi: ExtensionAPI): void {
401407
};
402408
}
403409

410+
// All validation passed — now safe to transition circuit to half-open
411+
// (allows exactly one probe attempt to test recovery)
412+
transitionToHalfOpen(circuit);
413+
404414
logLifecycleEvent({
405415
timestamp: new Date().toISOString(),
406416
session_name: sessionName,
@@ -542,10 +552,11 @@ export default function agentSpawnExtension(pi: ExtensionAPI): void {
542552
async execute() {
543553
let recentEvents = "";
544554
try {
545-
const { execSync } = require("node:child_process");
546-
const tail = execSync(`tail -20 "${LIFECYCLE_LOG_PATH}" 2>/dev/null`, { encoding: "utf-8" });
547-
if (tail.trim()) {
548-
const lines = tail.trim().split("\n");
555+
if (existsSync(LIFECYCLE_LOG_PATH)) {
556+
const lines = readFileSync(LIFECYCLE_LOG_PATH, "utf-8")
557+
.trimEnd()
558+
.split("\n")
559+
.slice(-20);
549560
recentEvents = lines
550561
.map((line: string) => {
551562
try {

pi/extensions/heartbeat.ts

Lines changed: 74 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
2626
import { Type } from "@sinclair/typebox";
2727
import { StringEnum } from "@mariozechner/pi-ai";
28-
import { appendFileSync, existsSync, mkdirSync, readdirSync, readFileSync, statSync } from "node:fs";
28+
import { appendFileSync, existsSync, mkdirSync, readdirSync, readFileSync, readlinkSync, statSync, unlinkSync } from "node:fs";
29+
import { execFileSync, execSync } from "node:child_process";
2930
import { homedir } from "node:os";
3031
import { dirname, join } from "node:path";
3132
import { discoverSubagentPackages, readSubagentState, resolveEffectiveState } from "./subagent-registry.ts";
@@ -127,7 +128,6 @@ function checkSessions(): CheckResult[] {
127128

128129
// Check that the symlink target (.sock file) exists
129130
try {
130-
const { readlinkSync } = require("node:fs");
131131
const target = readlinkSync(aliasPath);
132132
const sockPath = join(SOCKET_DIR, target);
133133
if (!existsSync(sockPath)) {
@@ -348,8 +348,7 @@ function checkUnansweredMentions(): CheckResult[] {
348348
// Support both bridge implementations:
349349
// - broker-bridge.mjs: "... (type: app_mention, ts: 1234.5678)"
350350
// - bridge.mjs: "app_mention ... ts: 1234.5678"
351-
const { execSync } = require("node:child_process");
352-
const logTail = execSync(`tail -500 "${bridgeLogPath}"`, { encoding: "utf-8" });
351+
const logTail = execFileSync("tail", ["-500", bridgeLogPath], { encoding: "utf-8" });
353352

354353
const mentionThreadTsSet = new Set<string>(extractMentionThreadTs(logTail));
355354

@@ -592,7 +591,6 @@ function logRecovery(entry: RecoveryAction): void {
592591
*/
593592
async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]> {
594593
const actions: RecoveryAction[] = [];
595-
const { execSync } = require("node:child_process");
596594

597595
for (const failure of failures) {
598596
// Auto-recover: bridge down → restart the bridge tmux session
@@ -613,7 +611,7 @@ async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]
613611

614612
// Kill existing bridge tmux session
615613
try {
616-
execSync('tmux kill-session -t baudbot-gateway-bridge 2>/dev/null', { timeout: 5000 });
614+
execFileSync("tmux", ["kill-session", "-t", "baudbot-gateway-bridge"], { timeout: 5000 });
617615
} catch {
618616
// May not exist — that's fine
619617
}
@@ -627,33 +625,56 @@ async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]
627625

628626
// Restart via startup script
629627
const startupScript = join(homedir(), ".pi", "agent", "skills", "control-agent", "startup-pi.sh");
630-
if (existsSync(startupScript)) {
631-
// Get live session UUIDs from session-control dir
632-
const sockFiles = readdirSync(SOCKET_DIR).filter((f) => f.endsWith(".sock"));
633-
const uuids = sockFiles.map((f) => f.replace(".sock", "")).join(" ");
634-
if (uuids) {
635-
execSync(`bash "${startupScript}" ${uuids} 2>&1`, {
636-
timeout: 30000,
637-
encoding: "utf-8",
638-
});
639-
640-
// Verify bridge came back
641-
await new Promise((resolve) => setTimeout(resolve, 3000));
642-
const verifyResult = await checkBridge();
628+
if (!existsSync(startupScript)) {
629+
const entry: RecoveryAction = {
630+
timestamp: new Date().toISOString(),
631+
check: failure.name,
632+
action: "bridge_restart",
633+
success: false,
634+
detail: `Bridge session killed but startup script not found at ${startupScript} — cannot restart`,
635+
};
636+
actions.push(entry);
637+
logRecovery(entry);
638+
continue;
639+
}
643640

644-
const entry: RecoveryAction = {
645-
timestamp: new Date().toISOString(),
646-
check: failure.name,
647-
action: "bridge_restart",
648-
success: verifyResult.ok,
649-
detail: verifyResult.ok
650-
? "Bridge restarted and verified healthy"
651-
: `Bridge restart attempted but still failing: ${verifyResult.detail}`,
652-
};
653-
actions.push(entry);
654-
logRecovery(entry);
655-
}
641+
// Get live session UUIDs from session-control dir
642+
const sockFiles = readdirSync(SOCKET_DIR).filter((f) => f.endsWith(".sock"));
643+
const uuids = sockFiles.map((f) => f.replace(".sock", ""));
644+
if (uuids.length === 0) {
645+
const entry: RecoveryAction = {
646+
timestamp: new Date().toISOString(),
647+
check: failure.name,
648+
action: "bridge_restart",
649+
success: false,
650+
detail: "Bridge session killed but no live socket UUIDs found — cannot restart",
651+
};
652+
actions.push(entry);
653+
logRecovery(entry);
654+
continue;
656655
}
656+
657+
// Pass UUIDs as separate args to avoid shell injection
658+
execFileSync("bash", [startupScript, ...uuids], {
659+
timeout: 30000,
660+
encoding: "utf-8",
661+
});
662+
663+
// Verify bridge came back
664+
await new Promise((resolve) => setTimeout(resolve, 3000));
665+
const verifyResult = await checkBridge();
666+
667+
const entry: RecoveryAction = {
668+
timestamp: new Date().toISOString(),
669+
check: failure.name,
670+
action: "bridge_restart",
671+
success: verifyResult.ok,
672+
detail: verifyResult.ok
673+
? "Bridge restarted and verified healthy"
674+
: `Bridge restart attempted but still failing: ${verifyResult.detail}`,
675+
};
676+
actions.push(entry);
677+
logRecovery(entry);
657678
} catch (err: any) {
658679
const entry: RecoveryAction = {
659680
timestamp: new Date().toISOString(),
@@ -671,17 +692,34 @@ async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]
671692
if (failure.name.startsWith("orphan:")) {
672693
const sessionName = failure.name.replace("orphan:", "");
673694
try {
674-
// Kill the tmux session
695+
// Kill the tmux session (use execFileSync to avoid shell injection)
696+
let tmuxKilled = false;
675697
try {
676-
execSync(`tmux kill-session -t "${sessionName}" 2>/dev/null`, { timeout: 5000 });
677-
} catch {
678-
// May already be dead
698+
execFileSync("tmux", ["kill-session", "-t", sessionName], { timeout: 5000 });
699+
tmuxKilled = true;
700+
} catch (killErr: any) {
701+
// "session not found" means it's already dead — that's fine
702+
const msg = killErr.message || String(killErr);
703+
if (msg.includes("session not found") || msg.includes("can't find session")) {
704+
tmuxKilled = true; // Already gone — counts as success
705+
} else {
706+
// Real error (tmux daemon down, permissions, timeout, etc.)
707+
const entry: RecoveryAction = {
708+
timestamp: new Date().toISOString(),
709+
check: failure.name,
710+
action: "orphan_cleanup",
711+
success: false,
712+
detail: `Failed to kill tmux session "${sessionName}": ${msg}`,
713+
};
714+
actions.push(entry);
715+
logRecovery(entry);
716+
continue;
717+
}
679718
}
680719

681720
// Remove the stale alias
682721
const aliasPath = join(SOCKET_DIR, `${sessionName}.alias`);
683722
if (existsSync(aliasPath)) {
684-
const { unlinkSync } = require("node:fs");
685723
unlinkSync(aliasPath);
686724
}
687725

0 commit comments

Comments
 (0)