Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion packages/server/src/services/hermes/agent-bridge/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,23 @@ export class AgentBridgeClient {
let cursor = 0
let eventCursor = 0
for (;;) {
const chunk = await this.getOutput(runId, cursor, eventCursor, options)
let chunk: AgentBridgeOutput
try {
chunk = await this.getOutput(runId, cursor, eventCursor, options)
} catch (err) {
// The bridge worker keeps run state in-memory only — if it has been
// restarted (deploy, OOM, crash) any in-flight run becomes unknown.
// Surface a typed error so callers can handle gracefully instead of
// looping or showing the raw KeyError to users.
const message = err instanceof Error ? err.message : String(err ?? '')
if (/unknown run\b/i.test(message)) {
const restartErr: any = new Error(`bridge worker restarted; run ${runId} no longer tracked`)
restartErr.code = 'BRIDGE_RUN_LOST'
restartErr.runId = runId
throw restartErr
Comment on lines +443 to +446
}
throw err
}
cursor = chunk.cursor
eventCursor = chunk.event_cursor
if (chunk.delta || chunk.done || (chunk.events && chunk.events.length > 0)) yield chunk
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,22 @@ function looksLikeAgentFailure(value: string): boolean {
|| /\b(?:401|403|429|500|502|503|504)\b/.test(value) && /\b(?:unauthorized|forbidden|rate limit|unavailable|failed|error)\b/i.test(value)
}

/**
 * Detect errors that mean the agent bridge worker lost track of a run,
 * typically because the worker was restarted (e.g. systemd unit restart,
 * OOM, crash recovery). The Python bridge keeps run state purely in-memory,
 * so any subsequent `get_output` / `get_result` poll on a run started before
 * the restart fails with `unknown run: <id>`.
 *
 * Two shapes are recognised:
 *  - the raw bridge failure, whose message contains `unknown run`
 *    (matched case-insensitively, as a whole word);
 *  - the typed error the bridge client rethrows for that same condition,
 *    tagged with `code === 'BRIDGE_RUN_LOST'`. Its message is reworded to
 *    "bridge worker restarted; run <id> no longer tracked", so the message
 *    check alone would miss it.
 *
 * @param err - The caught value; may be an `Error`, a string, or anything else.
 * @returns `true` when the value represents a lost/unknown bridge run.
 */
export function isBridgeUnknownRunError(err: unknown): boolean {
  // Prefer the machine-readable code: it survives any rewording of the message.
  if (
    typeof err === 'object'
    && err !== null
    && (err as { code?: unknown }).code === 'BRIDGE_RUN_LOST'
  ) {
    return true
  }
  // Fall back to the raw bridge message; null/undefined become an empty string.
  const message = err instanceof Error ? err.message : String(err ?? '')
  return /unknown run\b/i.test(message)
}

/**
 * User-facing (Portuguese) text used as the error message in place of the raw
 * bridge error when `isBridgeUnknownRunError` matches. English rendering:
 * "The connection to the agent was restarted (usually because of a deploy or
 * a service restart). Your message was saved — just send it again to continue."
 *
 * NOTE(review): the text promises the user's message was saved; confirm the
 * error path actually persists it before relying on this wording.
 */
const BRIDGE_RESTART_USER_MESSAGE =
'A conexão com o agente foi reiniciada (geralmente por um deploy ou reinicialização do serviço). Sua mensagem foi salva — basta enviar de novo para continuar.'

export function bridgeTerminalError(chunk: Pick<AgentBridgeOutput, 'status' | 'error' | 'result'>): string | null {
const result = chunk.result && typeof chunk.result === 'object' && !Array.isArray(chunk.result)
? chunk.result as Record<string, unknown>
Expand Down Expand Up @@ -309,6 +325,7 @@ export async function handleBridgeRun(
if (state.activeRunMarker !== runMarker) return
if (!state.isWorking) return
const queueLen = state.queue?.length ?? 0
const bridgeRestarted = isBridgeUnknownRunError(err)
state.isWorking = false
state.isAborting = false
state.profile = undefined
Expand All @@ -318,7 +335,15 @@ export async function handleBridgeRun(
state.bridgePendingToolCallMarkup = undefined
flushBridgePendingToDb(state, session_id)
updateSessionStats(session_id)
const message = err instanceof Error ? err.message : String(err)
const rawMessage = err instanceof Error ? err.message : String(err)
const message = bridgeRestarted ? BRIDGE_RESTART_USER_MESSAGE : rawMessage
if (bridgeRestarted) {
bridgeLogger.warn({
sessionId: session_id,
runId: state.runId,
rawError: rawMessage,
Comment on lines +338 to +344
}, '[chat-run-socket] bridge worker restarted mid-run; recovering session')
}
const errUsage = await calcAndUpdateUsage(session_id, state, emit)
const errContextTokens = await refreshFinalContextUsage({
sessionId: session_id,
Expand Down
Loading