From 94092a35a4b90db5650d2eab3cc4758cf8a912b5 Mon Sep 17 00:00:00 2001
From: Vegard Hansen
Date: Tue, 26 May 2026 22:21:47 +0200
Subject: [PATCH 1/2] =?UTF-8?q?fix(lambda):=20fail-open=20isJobQueued=20?=
 =?UTF-8?q?=E2=80=94=20assume=20queued=20on=20API=20errors?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wrap the isJobQueued check in a try/catch that assumes the job is still
queued when the GitHub API returns an error (404, rate limit, 502). This
prevents silent job drops when the API is transiently unavailable.

Previously, any error from getJobForWorkflowRun would propagate up and
(combined with the non-ScaleError catch behavior) cause the entire SQS
batch to be dropped.

Fixes #5026
---
 .../src/scale-runners/scale-up.test.ts          | 14 ++++++++++++++
 .../control-plane/src/scale-runners/scale-up.ts | 15 +++++++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts
index 8ac2c14489..72b3c5ef1d 100644
--- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts
+++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts
@@ -863,6 +863,20 @@ describe('scaleUp with GHES', () => {
         }),
       );
     });
+
+    it('Should assume job is queued when isJobQueued throws (fail-open)', async () => {
+      mockOctokit.actions.getJobForWorkflowRun.mockRejectedValue(new Error('GitHub API 502'));
+
+      const messages = createTestMessages(2);
+      await scaleUpModule.scaleUp(messages);
+
+      // All messages processed despite API error — fail-open prevents job drops
+      expect(createRunner).toHaveBeenCalledWith(
+        expect.objectContaining({
+          numberOfRunners: 2,
+        }),
+      );
+    });
   });
 });
 
diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts
index 395c87e8f8..1b7edf873e 100644
--- 
a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -416,10 +416,17 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise Date: Tue, 26 May 2026 22:58:33 +0200 Subject: [PATCH 2/2] refactor: sanitize error object in isJobQueued warning log Log only message and status instead of the full error object to avoid leaking request/response metadata from Octokit errors. --- .../functions/control-plane/src/scale-runners/scale-up.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 1b7edf873e..48333bdc11 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -421,7 +421,11 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise