Skip to content

Commit 5d10b3a

Browse files
authored
fix(gastown): surface startup failures (#3650)
* fix(gastown): surface startup rate-limit failures * chore(gastown): clarify startup error classification wording
1 parent b659fd4 commit 5d10b3a

5 files changed

Lines changed: 290 additions & 12 deletions

File tree

services/gastown/container/src/control-server.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import type {
3737
StreamTicketResponse,
3838
MergeResult,
3939
} from './types';
40+
import { classifyStartupError } from './startup-error';
4041

4142
const MAX_TICKETS = 1000;
4243
const streamTickets = new Map<string, { agentId: string; expiresAt: number }>();
@@ -359,9 +360,17 @@ app.post('/agents/start', async c => {
359360
} = agent;
360361
return c.json(safeAgent, 201);
361362
} catch (err) {
362-
const message = err instanceof Error ? err.message : String(err);
363-
console.error(`[control-server] /agents/start: FAILED for ${parsed.data.name}: ${message}`);
364-
return c.json({ error: message }, 500);
363+
const failure = classifyStartupError(err);
364+
const details = [
365+
`error=${failure.error}`,
366+
failure.phase ? `phase=${failure.phase}` : null,
367+
failure.status ? `status=${failure.status}` : null,
368+
failure.error_type ? `error_type=${failure.error_type}` : null,
369+
].filter(value => value !== null);
370+
console.error(
371+
`[control-server] /agents/start: FAILED for ${parsed.data.name}: ${details.join(' ')}`
372+
);
373+
return c.json(failure, 500);
365374
}
366375
});
367376

services/gastown/container/src/process-manager.ts

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import {
2323
} from './control-server';
2424
import { log } from './logger';
2525
import { refreshTokenIfNearExpiry } from './token-refresh';
26+
import { AgentStartupError, classifyStartupError } from './startup-error';
2627

2728
const MANAGER_LOG = '[process-manager]';
2829

@@ -1319,14 +1320,18 @@ async function startAgentImpl(
13191320
// history is already in kilo.db and re-sending the startup prompt
13201321
// would create a duplicate turn.
13211322
if (!resumed) {
1322-
await client.session.prompt({
1323-
path: { id: sessionId },
1324-
body: {
1325-
parts: [{ type: 'text', text: request.prompt }],
1326-
...(modelParam ? { model: modelParam } : {}),
1327-
...(request.systemPrompt ? { system: request.systemPrompt } : {}),
1328-
},
1329-
});
1323+
try {
1324+
await client.session.prompt({
1325+
path: { id: sessionId },
1326+
body: {
1327+
parts: [{ type: 'text', text: request.prompt }],
1328+
...(modelParam ? { model: modelParam } : {}),
1329+
...(request.systemPrompt ? { system: request.systemPrompt } : {}),
1330+
},
1331+
});
1332+
} catch (err) {
1333+
throw new AgentStartupError(classifyStartupError(err, 'initial_prompt'));
1334+
}
13301335

13311336
// If the event stream errored while we were awaiting the prompt,
13321337
// the stream-error handler already set the agent to 'failed',
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import { describe, expect, it } from 'vitest';
2+
import { AgentStartupError, classifyStartupError } from './startup-error';
3+
4+
// Behavioural spec for classifyStartupError: structured gateway rejections,
// JSON embedded in SDK error messages, and pass-through of AgentStartupError.
describe('classifyStartupError', () => {
  it('classifies initial prompt rate-limit gateway failures', () => {
    // A plain object carrying an HTTP status and a nested gateway body.
    const gatewayRejection = {
      status: 429,
      body: {
        error: 'Free model rate limit exceeded',
        error_type: 'rate_limit_exceeded',
      },
    };

    const classified = classifyStartupError(gatewayRejection, 'initial_prompt');

    expect(classified).toEqual({
      error: 'Free model rate limit exceeded',
      phase: 'initial_prompt',
      status: 429,
      error_type: 'rate_limit_exceeded',
      action: 'Wait and retry, or switch the town/rig to a model with available quota.',
    });
  });

  it('extracts gateway details embedded in SDK error messages', () => {
    // The JSON document is embedded after a human-readable prefix.
    const sdkError = new Error(
      'Request failed: {"error":{"message":"quota exhausted"},"error_type":"rate_limit_exceeded"}'
    );

    const classified = classifyStartupError(sdkError, 'initial_prompt');

    // No status is available, so no `status` or `action` field is expected.
    expect(classified).toEqual({
      error: 'quota exhausted',
      phase: 'initial_prompt',
      error_type: 'rate_limit_exceeded',
    });
  });

  it('preserves already-classified startup payloads', () => {
    const alreadyClassified = new AgentStartupError({
      error: 'classified failure',
      phase: 'initial_prompt',
      status: 500,
      error_type: 'server_error',
    });

    const classified = classifyStartupError(alreadyClassified);

    expect(classified).toEqual({
      error: 'classified failure',
      phase: 'initial_prompt',
      status: 500,
      error_type: 'server_error',
    });
  });
});
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
import { z } from 'zod';
2+
3+
/** Startup phases that a failure can be attributed to. */
export const AgentStartupPhase = z.enum(['initial_prompt']);
export type AgentStartupPhase = z.infer<typeof AgentStartupPhase>;

/**
 * Structured description of a startup failure.
 * Only `error` is required; every other field is optional.
 */
export type AgentStartupErrorPayload = {
  error: string;
  phase?: AgentStartupPhase;
  status?: number;
  error_type?: string;
  action?: string;
};

// Shared optional field schemas, reused by the object schemas below.
const optionalString = z.string().optional();
const optionalNumber = z.number().optional();
const optionalUnknown = z.unknown().optional();

/** Any object with string keys; values are left unvalidated. */
const StringRecord = z.record(z.string(), z.unknown());

/** Loose shape of an error body; unknown keys are kept via passthrough. */
const GatewayErrorBody = z
  .object({
    error: optionalUnknown,
    error_type: optionalString,
    message: optionalString,
  })
  .passthrough();

/**
 * Loose shape of a thrown value that may carry a status and a body, either
 * at the top level or under `response`. Unknown keys are kept via passthrough.
 */
const ErrorObject = z
  .object({
    message: optionalString,
    status: optionalNumber,
    statusCode: optionalNumber,
    code: z.union([z.string(), z.number()]).optional(),
    body: optionalUnknown,
    data: optionalUnknown,
    response: z
      .object({
        status: optionalNumber,
        statusCode: optionalNumber,
        body: optionalUnknown,
        data: optionalUnknown,
      })
      .passthrough()
      .optional(),
  })
  .passthrough();
41+
42+
/**
 * Error thrown for a startup failure that has already been classified.
 * The structured payload is kept on the instance, and `payload.error`
 * becomes the regular `Error` message.
 */
export class AgentStartupError extends Error {
  /** The classified failure details supplied at construction time. */
  readonly payload: AgentStartupErrorPayload;

  /**
   * @param payload Classified failure; its `error` text is used as the message.
   */
  constructor(payload: AgentStartupErrorPayload) {
    const { error: summary } = payload;
    super(summary);
    this.name = 'AgentStartupError';
    this.payload = payload;
  }
}
51+
52+
/** Outcome of a JSON parse attempt; `data` is present only on success. */
type JsonParseResult = { success: true; data: unknown } | { success: false };

/**
 * Parses JSON out of a string without throwing.
 *
 * The trimmed text is parsed as a whole first. If that fails, the span from
 * the first `{` to the last `}` is parsed instead, which recovers an object
 * embedded in surrounding text (for example `Request failed: {...}`).
 *
 * @param value Text that may be, or may contain, a JSON document.
 * @returns `{ success: true, data }` when a parse succeeds, otherwise
 *          `{ success: false }`.
 */
function parseJsonFromString(value: string): JsonParseResult {
  // Single non-throwing parse attempt shared by both strategies.
  const attempt = (candidate: string): JsonParseResult => {
    try {
      return { success: true, data: JSON.parse(candidate) };
    } catch {
      return { success: false };
    }
  };

  const text = value.trim();
  if (text.length === 0) return { success: false };

  const whole = attempt(text);
  if (whole.success) return whole;

  // Fall back to the outermost brace-delimited span, if one exists.
  const open = text.indexOf('{');
  const close = text.lastIndexOf('}');
  const hasBraceSpan = open !== -1 && close > open;
  return hasBraceSpan ? attempt(text.slice(open, close + 1)) : { success: false };
}
71+
72+
/**
 * Interprets a value as a gateway error body.
 *
 * Strings are first decoded with `parseJsonFromString` and the decoded value
 * is interpreted recursively. Other values are validated against
 * `GatewayErrorBody`.
 *
 * @param value Candidate body: a JSON string or an already-decoded value.
 * @returns The parsed body, or `null` when the value cannot be decoded, fails
 *          validation, or has none of `error`, `error_type` and `message`.
 */
function readGatewayBody(value: unknown): z.infer<typeof GatewayErrorBody> | null {
  if (typeof value === 'string') {
    const decoded = parseJsonFromString(value);
    return decoded.success ? readGatewayBody(decoded.data) : null;
  }

  const result = GatewayErrorBody.safeParse(value);
  if (!result.success) return null;

  // A body with none of the recognised fields carries no useful detail.
  const { error, error_type: errorType, message } = result.data;
  const carriesDetail = error !== undefined || errorType !== undefined || message !== undefined;
  return carriesDetail ? result.data : null;
}
90+
91+
/**
 * Extracts human-readable text from the `error` field of a gateway body.
 *
 * @param errorValue The raw `error` value: a string, an object, or anything else.
 * @returns The string itself; otherwise the object's string `message`;
 *          otherwise its string `code`; otherwise `undefined`.
 */
function readBodyMessage(errorValue: unknown): string | undefined {
  if (typeof errorValue === 'string') return errorValue;

  const asRecord = StringRecord.safeParse(errorValue);
  if (!asRecord.success) return undefined;

  // `message` takes precedence over `code`.
  const { message, code } = asRecord.data;
  if (typeof message === 'string') return message;
  return typeof code === 'string' ? code : undefined;
}
105+
106+
/**
 * Locates the gateway error body carried by a thrown value.
 *
 * Lookup order:
 *  1. JSON embedded in an `Error`'s message.
 *  2. The value itself, when it has an `error` or `error_type` field.
 *  3. Nested `body`, `data`, `response.body`, `response.data`.
 *  4. The value itself when it only has a `message`.
 *  5. JSON embedded in a `message` field, when the value itself did not
 *     qualify as a body.
 *
 * Step 2 deliberately ignores a top-level body that has only `message`.
 * Every `Error` instance has a `message`, so accepting it there would stop
 * the search before the nested bodies are read, and the `error_type` and
 * gateway message carried in `err.body` would be lost.
 *
 * @param err The thrown value; may be anything.
 * @returns The most specific gateway body found, or `null` when none exists.
 */
function gatewayBodyFromError(err: unknown): z.infer<typeof GatewayErrorBody> | null {
  if (err instanceof Error) {
    const fromMessage = readGatewayBody(err.message);
    if (fromMessage) return fromMessage;
  }

  const direct = readGatewayBody(err);
  // Only a top-level body with gateway-specific fields wins outright.
  if (direct && (direct.error !== undefined || direct.error_type !== undefined)) {
    return direct;
  }

  const parsed = ErrorObject.safeParse(err);
  if (!parsed.success) return direct;

  const nested =
    readGatewayBody(parsed.data.body) ??
    readGatewayBody(parsed.data.data) ??
    readGatewayBody(parsed.data.response?.body) ??
    readGatewayBody(parsed.data.response?.data);
  if (nested) return nested;

  // No nested body: fall back to the message-only top-level body, if any.
  if (direct) return direct;

  return parsed.data.message ? readGatewayBody(parsed.data.message) : null;
}
126+
127+
/**
 * Derives a numeric status from a thrown value.
 *
 * Reads, in order, `status`, `statusCode`, `response.status` and
 * `response.statusCode`. If none yields a truthy number, `code` is used:
 * a numeric `code` as-is, or a string `code` of exactly three digits
 * converted to a number.
 *
 * @param err The thrown value; may be anything.
 * @returns The status, or `undefined` when the value does not match
 *          `ErrorObject` or carries no usable status.
 */
function statusFromError(err: unknown): number | undefined {
  const result = ErrorObject.safeParse(err);
  if (!result.success) return undefined;

  const { status, statusCode, response, code } = result.data;
  const explicitStatus = status ?? statusCode ?? response?.status ?? response?.statusCode;
  // Truthiness check: a status of 0 falls through to the `code` lookup.
  if (explicitStatus) return explicitStatus;

  if (typeof code === 'number') return code;
  const isThreeDigitCode = typeof code === 'string' && /^\d{3}$/.test(code);
  return isThreeDigitCode ? Number(code) : undefined;
}
143+
144+
/**
 * Produces fallback error text for a thrown value.
 *
 * @param err The thrown value; may be anything.
 * @returns The string itself, an `Error`'s message, or the generic
 *          'Agent startup failed' text for every other kind of value.
 */
function messageFromError(err: unknown): string {
  if (typeof err === 'string') return err;
  return err instanceof Error ? err.message : 'Agent startup failed';
}
149+
150+
/**
 * Converts a thrown value into a structured startup-failure payload.
 *
 * An `AgentStartupError` is returned as its existing payload. Otherwise the
 * gateway body and status are read from the value. A 429 with `error_type`
 * `rate_limit_exceeded` during the `initial_prompt` phase yields a payload
 * that also includes a suggested `action`. All other failures yield a
 * payload containing only the fields that could be determined.
 *
 * @param err   The thrown value; may be anything.
 * @param phase Startup phase in which the failure occurred, if known.
 * @returns The classified payload; `error` is always populated.
 */
export function classifyStartupError(
  err: unknown,
  phase?: AgentStartupPhase
): AgentStartupErrorPayload {
  if (err instanceof AgentStartupError) return err.payload;

  const gatewayBody = gatewayBodyFromError(err);
  const upstreamStatus = statusFromError(err);
  const failureKind = gatewayBody?.error_type;
  const gatewayText = readBodyMessage(gatewayBody?.error) ?? gatewayBody?.message;

  const isRateLimitedPrompt =
    phase === 'initial_prompt' && upstreamStatus === 429 && failureKind === 'rate_limit_exceeded';
  if (isRateLimitedPrompt) {
    return {
      error:
        gatewayText ??
        'Kilo gateway rejected the initial prompt because the selected model is rate limited.',
      phase: 'initial_prompt',
      status: 429,
      error_type: 'rate_limit_exceeded',
      action: 'Wait and retry, or switch the town/rig to a model with available quota.',
    };
  }

  // Generic failure: attach only the details that are actually known.
  const payload: AgentStartupErrorPayload = { error: gatewayText ?? messageFromError(err) };
  if (phase) payload.phase = phase;
  if (upstreamStatus) payload.status = upstreamStatus;
  if (failureKind) payload.error_type = failureKind;
  return payload;
}

services/gastown/src/dos/town/container-dispatch.ts

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
* All container communication goes through the TownContainerDO stub.
44
*/
55

6+
import { z } from 'zod';
67
import { getTownContainerStub } from '../TownContainer.do';
78
import { signAgentJWT, signContainerJWT } from '../../util/jwt.util';
89
import { buildPolecatSystemPrompt } from '../../prompts/polecat-system.prompt';
@@ -14,6 +15,14 @@ import { resolveGitHubTokenString } from './town-scm';
1415

1516
const TOWN_LOG = '[Town.do]';
1617

18+
// Expected JSON shape of an error response body for a failed agent start.
// Only `error` is required.
const optionalText = z.string().optional();
const ContainerStartError = z.object({
  error: z.string(),
  phase: optionalText,
  status: z.number().optional(),
  error_type: optionalText,
  action: optionalText,
});
25+
1726
// Allowlist of git push flags that are safe to pass from rig config.
1827
// Flags that bypass hooks (--no-verify), rewrite history (--force,
1928
// --force-with-lease), or alter remote refs in dangerous ways are
@@ -60,6 +69,26 @@ export function getLastStartError(): string | null {
6069
return lastStartError;
6170
}
6271

72+
/**
 * Builds a one-line description of a failed container start response.
 *
 * When the body is JSON matching `ContainerStartError`, the result combines
 * the phase (or 'container start failed'), the error type, the upstream
 * status, the error text and the optional action, capped at 500 characters.
 * Otherwise the result is the response status followed by the first
 * 300 characters of the raw body.
 *
 * @param status   HTTP status of the container response.
 * @param bodyText Raw response body text.
 * @returns The formatted error description.
 */
function formatContainerStartError(status: number, bodyText: string): string {
  const rawFallback = `(${status}) ${bodyText.slice(0, 300)}`;

  let decoded: unknown;
  try {
    decoded = JSON.parse(bodyText);
  } catch {
    return rawFallback;
  }

  const result = ContainerStartError.safeParse(decoded);
  if (!result.success) return rawFallback;

  const { phase, error_type: errorType, status: upstreamStatus, error, action } = result.data;

  const segments: string[] = [phase ? `${phase} failed` : 'container start failed'];
  if (errorType !== undefined) segments.push(errorType);
  if (upstreamStatus) segments.push(`upstream ${upstreamStatus}`);

  const actionSuffix = action ? ` Action: ${action}` : '';
  return `(${status}) ${segments.join(': ')}: ${error}${actionSuffix}`.slice(0, 500);
}
91+
6392
/**
6493
* Resolve the GASTOWN_JWT_SECRET binding to a string.
6594
*/
@@ -553,7 +582,7 @@ export async function startAgentInContainer(
553582
});
554583
return { started: true, containerFetchMs: durationMs };
555584
}
556-
const errorMsg = `(${response.status}) ${text.slice(0, 300)}`;
585+
const errorMsg = formatContainerStartError(response.status, text);
557586
console.error(
558587
`${TOWN_LOG} startAgentInContainer: error response for ` +
559588
`agent=${params.agentId} role=${params.role}: ${errorMsg}`

0 commit comments

Comments
 (0)