Skip to content

Commit 62de07c

Browse files
committed
improve error reporting for supervisor
1 parent 3d906a8 commit 62de07c

File tree

3 files changed

+86
-10
lines changed

3 files changed

+86
-10
lines changed

apps/supervisor/src/services/failedPodHandler.ts

Lines changed: 60 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ export class FailedPodHandler {
2626
private readonly informer: Informer<V1Pod>;
2727
private readonly reconnectIntervalMs: number;
2828
private reconnecting = false;
29+
private reconnectAttempt = 0;
2930

3031
// Metrics
3132
private readonly register: Registry;
@@ -271,24 +272,34 @@ export class FailedPodHandler {
271272
this.reconnecting = true;
272273

273274
try {
274-
const error = err instanceof Error ? err : undefined;
275+
const errorDetails = this.getErrorDetails(err);
276+
const reconnectDelayMs = Math.min(
277+
this.reconnectIntervalMs * 2 ** Math.max(this.reconnectAttempt, 0),
278+
30_000
279+
);
275280
this.logger.error("error event fired", {
276281
informerName,
277-
error: error?.message,
278-
errorType: error?.name,
282+
reconnectAttempt: this.reconnectAttempt + 1,
283+
reconnectDelayMs,
284+
...errorDetails,
279285
});
280286
this.informerEventsTotal.inc({ namespace: this.namespace, verb: "error" });
287+
this.reconnectAttempt++;
281288

282289
// Reconnect on errors
283-
await setTimeout(this.reconnectIntervalMs);
290+
await setTimeout(reconnectDelayMs);
291+
await this.informer.stop().catch((stopError) => {
292+
this.logger.warn("onError: informer stop before reconnect failed", {
293+
informerName,
294+
...this.getErrorDetails(stopError),
295+
});
296+
});
284297
await this.informer.start();
285298
} catch (handlerError) {
286-
const error = handlerError instanceof Error ? handlerError : undefined;
287299
this.logger.error("onError: reconnection attempt failed", {
288300
informerName,
289-
error: error?.message,
290-
errorType: error?.name,
291-
errorStack: error?.stack,
301+
reconnectAttempt: this.reconnectAttempt,
302+
...this.getErrorDetails(handlerError),
292303
});
293304
} finally {
294305
this.reconnecting = false;
@@ -300,10 +311,51 @@ export class FailedPodHandler {
300311
}
301312

302313
private async onConnect(informerName: string) {
314+
this.reconnectAttempt = 0;
303315
this.logger.info(`informer connected: ${informerName}`);
304316
this.informerEventsTotal.inc({ namespace: this.namespace, verb: "connect" });
305317
}
306318

319+
private getErrorDetails(error: unknown) {
320+
if (error instanceof Error) {
321+
return {
322+
error: error.message,
323+
errorType: error.name,
324+
errorStack: error.stack,
325+
errorCause:
326+
error.cause instanceof Error
327+
? {
328+
name: error.cause.name,
329+
message: error.cause.message,
330+
}
331+
: error.cause,
332+
};
333+
}
334+
335+
if (typeof error === "object" && error !== null) {
336+
const details = error as Record<string, unknown>;
337+
const message = typeof details.message === "string" ? details.message : undefined;
338+
const code = typeof details.code === "string" ? details.code : undefined;
339+
const statusCode =
340+
typeof details.statusCode === "number"
341+
? details.statusCode
342+
: typeof details.statusCode === "string"
343+
? Number(details.statusCode)
344+
: undefined;
345+
346+
return {
347+
error: message,
348+
errorCode: code,
349+
statusCode: Number.isNaN(statusCode) ? undefined : statusCode,
350+
rawError: details,
351+
};
352+
}
353+
354+
return {
355+
rawError: error,
356+
};
357+
}
358+
307359
private podSummary(pod: V1Pod) {
308360
return {
309361
name: pod.metadata?.name,

packages/cli-v3/src/entryPoints/managed/controller.ts

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -471,10 +471,27 @@ export class ManagedRunController {
471471
});
472472

473473
socket.on("connect_error", (error) => {
474+
const errorDetails =
475+
error instanceof Error
476+
? {
477+
name: error.name,
478+
message: error.message,
479+
cause:
480+
error.cause instanceof Error
481+
? {
482+
name: error.cause.name,
483+
message: error.cause.message,
484+
}
485+
: error.cause,
486+
}
487+
: {
488+
value: String(error),
489+
};
490+
474491
this.sendDebugLog({
475492
runId: this.runFriendlyId,
476493
message: "Socket connection error",
477-
properties: { error: error instanceof Error ? error.message : String(error) },
494+
properties: { error: errorDetails },
478495
});
479496
});
480497

packages/core/src/v3/apiClient/core.ts

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -747,7 +747,14 @@ export async function wrapZodFetch<T extends z.ZodTypeAny>(
747747
data: response,
748748
};
749749
} catch (error) {
750-
if (error instanceof ApiError) {
750+
if (error instanceof ApiConnectionError) {
751+
const causeMessage = error.cause instanceof Error ? error.cause.message : undefined;
752+
753+
return {
754+
success: false,
755+
error: causeMessage ? `${error.message} (${causeMessage})` : error.message,
756+
};
757+
} else if (error instanceof ApiError) {
751758
return {
752759
success: false,
753760
error: error.message,

0 commit comments

Comments
 (0)