Skip to content

Commit a683e90

Browse files
authored
fix: Catch exceptions from user-provided retry handlers to prevent orchestration crash (#193)
1 parent 9d5bbc3 commit a683e90

3 files changed

Lines changed: 148 additions & 2 deletions

File tree

packages/durabletask-js/src/worker/logs.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ const CATEGORY_ENTITIES = "Microsoft.DurableTask.Worker.Entities";
7676
/** @internal */ export const EVENT_ACTIVITY_EXECUTION_ERROR = 734;
7777
/** @internal */ export const EVENT_ACTIVITY_RESPONSE_ERROR = 735;
7878
/** @internal */ export const EVENT_STREAM_ERROR_INFO = 736;
79+
/** @internal */ export const EVENT_RETRY_HANDLER_EXCEPTION = 737;
7980

8081
// ── Entity-specific Event IDs (800+ range) ──────────────────────────────────
8182

@@ -194,6 +195,24 @@ export function retryingTask(logger: Logger, instanceId: string, name: string, a
194195
}, `${instanceId}: Evaluating custom retry handler for failed '${name}' task. Attempt = ${attempt}.`);
195196
}
196197

198+
/**
199+
* Logs that a retry handler or handleFailure predicate threw an exception.
200+
* The task will not be retried and will fail with its original error.
201+
*/
202+
export function retryHandlerException(logger: Logger, instanceId: string, name: string, error: unknown): void {
203+
const msg = toErrorMessage(error);
204+
emitLog(logger, "warn", {
205+
eventId: EVENT_RETRY_HANDLER_EXCEPTION,
206+
category: CATEGORY_ORCHESTRATIONS,
207+
properties: {
208+
instanceId,
209+
name,
210+
error: msg,
211+
...(error instanceof Error && error.stack ? { stack: error.stack } : {}),
212+
},
213+
}, `${instanceId}: Retry evaluation for '${name}' threw an exception and will not be retried: ${msg}`);
214+
}
215+
197216
// ═══════════════════════════════════════════════════════════════════════════════
198217
// JS-specific Worker Lifecycle Logs (Event IDs 700+)
199218
// ═══════════════════════════════════════════════════════════════════════════════

packages/durabletask-js/src/worker/orchestration-executor.ts

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,16 @@ export class OrchestrationExecutor {
765765
): Promise<boolean> {
766766
if (task instanceof RetryableTask) {
767767
task.recordFailure(errorMessage, failureDetails);
768-
const nextDelayMs = task.computeNextDelayInMilliseconds(ctx._currentUtcDatetime);
768+
let nextDelayMs: number | undefined;
769+
try {
770+
nextDelayMs = task.computeNextDelayInMilliseconds(ctx._currentUtcDatetime);
771+
} catch (e: unknown) {
772+
// The retry policy's handleFailure predicate threw an exception.
773+
// Treat this as "don't retry" so the task fails with its original error
774+
// rather than crashing the entire orchestration.
775+
WorkerLogs.retryHandlerException(this._logger, ctx._instanceId, task.taskName, e);
776+
return false;
777+
}
769778

770779
if (nextDelayMs !== undefined) {
771780
WorkerLogs.retryingTask(this._logger, ctx._instanceId, task.taskName, task.attemptCount);
@@ -776,7 +785,16 @@ export class OrchestrationExecutor {
776785
}
777786
} else if (task instanceof RetryHandlerTask) {
778787
task.recordFailure(errorMessage, failureDetails);
779-
const retryResult = await task.shouldRetry(ctx._currentUtcDatetime);
788+
let retryResult: boolean | number;
789+
try {
790+
retryResult = await task.shouldRetry(ctx._currentUtcDatetime);
791+
} catch (e: unknown) {
792+
// The user-provided retry handler threw an exception.
793+
// Treat this as "don't retry" so the task fails with its original error
794+
// rather than crashing the entire orchestration.
795+
WorkerLogs.retryHandlerException(this._logger, ctx._instanceId, task.taskName, e);
796+
return false;
797+
}
780798

781799
// Only retry when the handler explicitly returns true or a finite number.
782800
// Using a positive check (=== true || finite number) instead of !== false

packages/durabletask-js/test/orchestration_executor.spec.ts

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1401,6 +1401,115 @@ describe("Orchestration Executor", () => {
14011401
const completeAction = getAndValidateSingleCompleteOrchestrationAction(result);
14021402
expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_FAILED);
14031403
});
1404+
1405+
it("should fail the task (not crash orchestration) when retry handler throws an exception", async () => {
1406+
const { result: startResult, replay } = await startOrchestration(
1407+
async function* (ctx: OrchestrationContext): any {
1408+
const retryHandler = async (_retryCtx: any) => {
1409+
throw new Error("Handler exploded!");
1410+
};
1411+
return yield ctx.callActivity("flakyActivity", undefined, { retry: retryHandler });
1412+
},
1413+
);
1414+
1415+
expect(startResult.actions[0].hasScheduletask()).toBe(true);
1416+
1417+
// Activity fails → handler throws → should NOT retry, task should fail with original error
1418+
const result = await replay(
1419+
[newTaskScheduledEvent(1, "flakyActivity")],
1420+
[newTaskFailedEvent(1, new Error("Original activity error"))],
1421+
);
1422+
const completeAction = getAndValidateSingleCompleteOrchestrationAction(result);
1423+
// The orchestration fails with the original task error, not the handler exception
1424+
expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_FAILED);
1425+
expect(completeAction?.getFailuredetails()?.getErrormessage()).toContain("Original activity error");
1426+
});
1427+
1428+
it("should allow orchestrator to catch TaskFailedError when retry handler throws", async () => {
1429+
const { result: startResult, replay } = await startOrchestration(
1430+
async function* (ctx: OrchestrationContext): any {
1431+
const retryHandler = async (_retryCtx: any) => {
1432+
throw new Error("Handler exploded!");
1433+
};
1434+
try {
1435+
yield ctx.callActivity("flakyActivity", undefined, { retry: retryHandler });
1436+
} catch (e: any) {
1437+
return "caught: " + e.message;
1438+
}
1439+
},
1440+
);
1441+
1442+
expect(startResult.actions[0].hasScheduletask()).toBe(true);
1443+
1444+
// Activity fails → handler throws → task fails → orchestrator catches the error
1445+
const result = await replay(
1446+
[newTaskScheduledEvent(1, "flakyActivity")],
1447+
[newTaskFailedEvent(1, new Error("Original activity error"))],
1448+
);
1449+
const completeAction = getAndValidateSingleCompleteOrchestrationAction(result);
1450+
expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_COMPLETED);
1451+
expect(completeAction?.getResult()?.getValue()).toContain("Original activity error");
1452+
});
1453+
1454+
it("should fail the task (not crash orchestration) when handleFailure predicate throws", async () => {
1455+
const { RetryPolicy } = await import("../src/task/retry/retry-policy");
1456+
1457+
const { result: startResult, replay } = await startOrchestration(
1458+
async function* (ctx: OrchestrationContext): any {
1459+
const retryPolicy = new RetryPolicy({
1460+
maxNumberOfAttempts: 3,
1461+
firstRetryIntervalInMilliseconds: 1000,
1462+
handleFailure: (_failure: any) => {
1463+
throw new Error("Predicate exploded!");
1464+
},
1465+
});
1466+
return yield ctx.callActivity("flakyActivity", undefined, { retry: retryPolicy });
1467+
},
1468+
);
1469+
1470+
expect(startResult.actions[0].hasScheduletask()).toBe(true);
1471+
1472+
// Activity fails → handleFailure throws → should NOT retry, task should fail with original error
1473+
const result = await replay(
1474+
[newTaskScheduledEvent(1, "flakyActivity")],
1475+
[newTaskFailedEvent(1, new Error("Original activity error"))],
1476+
);
1477+
const completeAction = getAndValidateSingleCompleteOrchestrationAction(result);
1478+
expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_FAILED);
1479+
expect(completeAction?.getFailuredetails()?.getErrormessage()).toContain("Original activity error");
1480+
});
1481+
1482+
it("should allow orchestrator to catch TaskFailedError when handleFailure predicate throws", async () => {
1483+
const { RetryPolicy } = await import("../src/task/retry/retry-policy");
1484+
1485+
const { result: startResult, replay } = await startOrchestration(
1486+
async function* (ctx: OrchestrationContext): any {
1487+
const retryPolicy = new RetryPolicy({
1488+
maxNumberOfAttempts: 3,
1489+
firstRetryIntervalInMilliseconds: 1000,
1490+
handleFailure: (_failure: any) => {
1491+
throw new Error("Predicate exploded!");
1492+
},
1493+
});
1494+
try {
1495+
yield ctx.callActivity("flakyActivity", undefined, { retry: retryPolicy });
1496+
} catch (e: any) {
1497+
return "caught: " + e.message;
1498+
}
1499+
},
1500+
);
1501+
1502+
expect(startResult.actions[0].hasScheduletask()).toBe(true);
1503+
1504+
// Activity fails → predicate throws → task fails → orchestrator catches the error
1505+
const result = await replay(
1506+
[newTaskScheduledEvent(1, "flakyActivity")],
1507+
[newTaskFailedEvent(1, new Error("Original activity error"))],
1508+
);
1509+
const completeAction = getAndValidateSingleCompleteOrchestrationAction(result);
1510+
expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_COMPLETED);
1511+
expect(completeAction?.getResult()?.getValue()).toContain("Original activity error");
1512+
});
14041513
});
14051514

14061515
it("should complete immediately when whenAll is called with an empty task array", async () => {

0 commit comments

Comments
 (0)