Skip to content

Commit cafb91b

Browse files
committed
fix: address PR #198 review feedback from Greptile and Sentry
- Fix circuit breaker race: separate isCircuitOpen check from half-open state transition, call transitionToHalfOpen only after all input validation passes (Sentry) - Fix orphan cleanup error swallowing: inspect tmux kill-session errors, only treat 'session not found' as success, report real failures (Sentry) - Fix bridge recovery gaps: add explicit failure actions when startup script is missing or no live socket UUIDs found (Greptile) - Fix shell injection: use execFileSync for tmux commands instead of execSync with string interpolation (Greptile) - Replace all dynamic require() calls with static ES module imports in both agent-spawn.ts and heartbeat.ts (Greptile) - Replace shell tail command with readFileSync for lifecycle log reading in spawn_status tool (Greptile)
1 parent 43954b6 commit cafb91b

File tree

2 files changed

+92
-43
lines changed

2 files changed

+92
-43
lines changed

pi/extensions/agent-spawn.ts

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
22
import { Type } from "@sinclair/typebox";
3-
import { appendFileSync, existsSync, mkdirSync, readlinkSync, statSync } from "node:fs";
3+
import { appendFileSync, existsSync, mkdirSync, readFileSync, readlinkSync, statSync } from "node:fs";
44
import net from "node:net";
55
import { homedir } from "node:os";
66
import { dirname, join, resolve } from "node:path";
@@ -63,14 +63,20 @@ function recordFailure(cb: CircuitBreaker): void {
6363

6464
function isCircuitOpen(cb: CircuitBreaker): boolean {
6565
if (cb.state !== "open") return false;
66-
// Check if cooldown has elapsed → transition to half-open
66+
// Check if cooldown has elapsed — eligible for half-open probe
6767
if (cb.lastFailureAt && Date.now() - cb.lastFailureAt >= CIRCUIT_COOLDOWN_MS) {
68-
cb.state = "half-open";
6968
return false;
7069
}
7170
return true;
7271
}
7372

73+
/** Transition to half-open state. Call only after input validation passes. */
74+
function transitionToHalfOpen(cb: CircuitBreaker): void {
75+
if (cb.state === "open" && cb.lastFailureAt && Date.now() - cb.lastFailureAt >= CIRCUIT_COOLDOWN_MS) {
76+
cb.state = "half-open";
77+
}
78+
}
79+
7480
function circuitStatus(cb: CircuitBreaker): string {
7581
const cooldownRemaining =
7682
cb.state === "open" && cb.lastFailureAt
@@ -401,6 +407,10 @@ export default function agentSpawnExtension(pi: ExtensionAPI): void {
401407
};
402408
}
403409

410+
// All validation passed — now safe to transition circuit to half-open
411+
// (allows exactly one probe attempt to test recovery)
412+
transitionToHalfOpen(circuit);
413+
404414
logLifecycleEvent({
405415
timestamp: new Date().toISOString(),
406416
session_name: sessionName,
@@ -542,10 +552,11 @@ export default function agentSpawnExtension(pi: ExtensionAPI): void {
542552
async execute() {
543553
let recentEvents = "";
544554
try {
545-
const { execSync } = require("node:child_process");
546-
const tail = execSync(`tail -20 "${LIFECYCLE_LOG_PATH}" 2>/dev/null`, { encoding: "utf-8" });
547-
if (tail.trim()) {
548-
const lines = tail.trim().split("\n");
555+
if (existsSync(LIFECYCLE_LOG_PATH)) {
556+
const lines = readFileSync(LIFECYCLE_LOG_PATH, "utf-8")
557+
.trimEnd()
558+
.split("\n")
559+
.slice(-20);
549560
recentEvents = lines
550561
.map((line: string) => {
551562
try {

pi/extensions/heartbeat.ts

Lines changed: 74 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
2626
import { Type } from "@sinclair/typebox";
2727
import { StringEnum } from "@mariozechner/pi-ai";
28-
import { appendFileSync, closeSync, existsSync, mkdirSync, openSync, readdirSync, readFileSync, readSync, statSync } from "node:fs";
28+
import { appendFileSync, closeSync, existsSync, mkdirSync, openSync, readdirSync, readFileSync, readlinkSync, readSync, statSync, unlinkSync } from "node:fs";
29+
import { execFileSync, execSync } from "node:child_process";
2930
import { homedir } from "node:os";
3031
import { dirname, join } from "node:path";
3132
import { discoverSubagentPackages, readSubagentState, resolveEffectiveState } from "./subagent-registry.ts";
@@ -132,7 +133,6 @@ function checkSessions(): CheckResult[] {
132133

133134
// Check that the symlink target (.sock file) exists
134135
try {
135-
const { readlinkSync } = require("node:fs");
136136
const target = readlinkSync(aliasPath);
137137
const sockPath = join(SOCKET_DIR, target);
138138
if (!existsSync(sockPath)) {
@@ -353,8 +353,7 @@ function checkUnansweredMentions(): CheckResult[] {
353353
// Support both bridge implementations:
354354
// - broker-bridge.mjs: "... (type: app_mention, ts: 1234.5678)"
355355
// - bridge.mjs: "app_mention ... ts: 1234.5678"
356-
const { execSync } = require("node:child_process");
357-
const logTail = execSync(`tail -500 "${bridgeLogPath}"`, { encoding: "utf-8" });
356+
const logTail = execFileSync("tail", ["-500", bridgeLogPath], { encoding: "utf-8" });
358357

359358
const mentionThreadTsSet = new Set<string>(extractMentionThreadTs(logTail));
360359

@@ -638,7 +637,6 @@ function logRecovery(entry: RecoveryAction): void {
638637
*/
639638
async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]> {
640639
const actions: RecoveryAction[] = [];
641-
const { execSync } = require("node:child_process");
642640

643641
for (const failure of failures) {
644642
// Auto-recover: bridge down → restart the bridge tmux session
@@ -659,7 +657,7 @@ async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]
659657

660658
// Kill existing bridge tmux session
661659
try {
662-
execSync('tmux kill-session -t baudbot-gateway-bridge 2>/dev/null', { timeout: 5000 });
660+
execFileSync("tmux", ["kill-session", "-t", "baudbot-gateway-bridge"], { timeout: 5000 });
663661
} catch {
664662
// May not exist — that's fine
665663
}
@@ -673,33 +671,56 @@ async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]
673671

674672
// Restart via startup script
675673
const startupScript = join(homedir(), ".pi", "agent", "skills", "control-agent", "startup-pi.sh");
676-
if (existsSync(startupScript)) {
677-
// Get live session UUIDs from session-control dir
678-
const sockFiles = readdirSync(SOCKET_DIR).filter((f) => f.endsWith(".sock"));
679-
const uuids = sockFiles.map((f) => f.replace(".sock", "")).join(" ");
680-
if (uuids) {
681-
execSync(`bash "${startupScript}" ${uuids} 2>&1`, {
682-
timeout: 30000,
683-
encoding: "utf-8",
684-
});
685-
686-
// Verify bridge came back
687-
await new Promise((resolve) => setTimeout(resolve, 3000));
688-
const verifyResult = await checkBridge();
674+
if (!existsSync(startupScript)) {
675+
const entry: RecoveryAction = {
676+
timestamp: new Date().toISOString(),
677+
check: failure.name,
678+
action: "bridge_restart",
679+
success: false,
680+
detail: `Bridge session killed but startup script not found at ${startupScript} — cannot restart`,
681+
};
682+
actions.push(entry);
683+
logRecovery(entry);
684+
continue;
685+
}
689686

690-
const entry: RecoveryAction = {
691-
timestamp: new Date().toISOString(),
692-
check: failure.name,
693-
action: "bridge_restart",
694-
success: verifyResult.ok,
695-
detail: verifyResult.ok
696-
? "Bridge restarted and verified healthy"
697-
: `Bridge restart attempted but still failing: ${verifyResult.detail}`,
698-
};
699-
actions.push(entry);
700-
logRecovery(entry);
701-
}
687+
// Get live session UUIDs from session-control dir
688+
const sockFiles = readdirSync(SOCKET_DIR).filter((f) => f.endsWith(".sock"));
689+
const uuids = sockFiles.map((f) => f.replace(".sock", ""));
690+
if (uuids.length === 0) {
691+
const entry: RecoveryAction = {
692+
timestamp: new Date().toISOString(),
693+
check: failure.name,
694+
action: "bridge_restart",
695+
success: false,
696+
detail: "Bridge session killed but no live socket UUIDs found — cannot restart",
697+
};
698+
actions.push(entry);
699+
logRecovery(entry);
700+
continue;
702701
}
702+
703+
// Pass UUIDs as separate args to avoid shell injection
704+
execFileSync("bash", [startupScript, ...uuids], {
705+
timeout: 30000,
706+
encoding: "utf-8",
707+
});
708+
709+
// Verify bridge came back
710+
await new Promise((resolve) => setTimeout(resolve, 3000));
711+
const verifyResult = await checkBridge();
712+
713+
const entry: RecoveryAction = {
714+
timestamp: new Date().toISOString(),
715+
check: failure.name,
716+
action: "bridge_restart",
717+
success: verifyResult.ok,
718+
detail: verifyResult.ok
719+
? "Bridge restarted and verified healthy"
720+
: `Bridge restart attempted but still failing: ${verifyResult.detail}`,
721+
};
722+
actions.push(entry);
723+
logRecovery(entry);
703724
} catch (err: any) {
704725
const entry: RecoveryAction = {
705726
timestamp: new Date().toISOString(),
@@ -717,17 +738,34 @@ async function tryAutoRecover(failures: CheckResult[]): Promise<RecoveryAction[]
717738
if (failure.name.startsWith("orphan:")) {
718739
const sessionName = failure.name.replace("orphan:", "");
719740
try {
720-
// Kill the tmux session
741+
// Kill the tmux session (use execFileSync to avoid shell injection)
742+
let tmuxKilled = false;
721743
try {
722-
execSync(`tmux kill-session -t "${sessionName}" 2>/dev/null`, { timeout: 5000 });
723-
} catch {
724-
// May already be dead
744+
execFileSync("tmux", ["kill-session", "-t", sessionName], { timeout: 5000 });
745+
tmuxKilled = true;
746+
} catch (killErr: any) {
747+
// "session not found" means it's already dead — that's fine
748+
const msg = killErr.message || String(killErr);
749+
if (msg.includes("session not found") || msg.includes("can't find session")) {
750+
tmuxKilled = true; // Already gone — counts as success
751+
} else {
752+
// Real error (tmux daemon down, permissions, timeout, etc.)
753+
const entry: RecoveryAction = {
754+
timestamp: new Date().toISOString(),
755+
check: failure.name,
756+
action: "orphan_cleanup",
757+
success: false,
758+
detail: `Failed to kill tmux session "${sessionName}": ${msg}`,
759+
};
760+
actions.push(entry);
761+
logRecovery(entry);
762+
continue;
763+
}
725764
}
726765

727766
// Remove the stale alias
728767
const aliasPath = join(SOCKET_DIR, `${sessionName}.alias`);
729768
if (existsSync(aliasPath)) {
730-
const { unlinkSync } = require("node:fs");
731769
unlinkSync(aliasPath);
732770
}
733771

0 commit comments

Comments
 (0)