From a4f8d980c3c3683f08d547b5cda89fef902b1e11 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Fri, 19 Jun 2026 18:29:30 +0200 Subject: [PATCH 01/13] fix(scale-down): batch TerminateInstances calls to reduce EC2 quota pressure EC2 bucket-1 (Fleet mutations) was hitting p99=552 calls/min vs 540 limit. TerminateInstances was called once per runner; now all runners in a scale-down invocation are terminated in a single batched EC2 call (up to 100 per batch). - Add `terminateRunners(ids[])` with chunked batching (TERMINATE_BATCH_SIZE=100) and recursive bisect on error to isolate bad IDs while terminating all good ones - Keep `terminateRunner(id)` as back-compat wrapper (scale-up cleanup unchanged) - `removeRunner` returns instanceId instead of calling terminate directly - `evaluateAndRemoveRunners` + `terminateOrphan` collect IDs then call once - Add `scripts/check-terminate-quota.sh` for before/after metric snapshots Co-Authored-By: Claude Opus 4.8 --- .../control-plane/src/aws/runners.test.ts | 62 ++++++++++++++++++- .../control-plane/src/aws/runners.ts | 44 +++++++++++-- .../src/scale-runners/scale-down.test.ts | 38 +++++++----- .../src/scale-runners/scale-down.ts | 38 +++++++++--- scripts/check-terminate-quota.sh | 38 ++++++++++++ 5 files changed, 188 insertions(+), 32 deletions(-) create mode 100755 scripts/check-terminate-quota.sh diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index 63f1412dd0..86b0de6dd6 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -19,7 +19,7 @@ import 'aws-sdk-client-mock-jest/vitest'; import { beforeEach, describe, expect, it, vi } from 'vitest'; import ScaleError from './../scale-runners/ScaleError'; -import { createRunner, listEC2Runners, tag, terminateRunner, untag } from './runners'; +import { createRunner, listEC2Runners, tag, terminateRunner, terminateRunners, untag } from './runners'; import type { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d'; process.env.AWS_REGION = 'eu-east-1'; @@ -269,6 +269,66 @@ describe('terminate runner', () => { }); }); +describe('terminateRunners', () => { + beforeEach(() => { + vi.clearAllMocks(); + mockEC2Client.reset(); + }); + + it('does nothing for empty list', async () => { + await terminateRunners([]); + expect(mockEC2Client).not.toHaveReceivedCommand(TerminateInstancesCommand); + }); + + it('issues one EC2 call for a small batch', async () => { + mockEC2Client.on(TerminateInstancesCommand).resolves({ + TerminatingInstances: [ + { InstanceId: 'i-1', CurrentState: { Name: 'shutting-down' } }, + { InstanceId: 'i-2', CurrentState: { Name: 'shutting-down' } }, + ], + }); + await terminateRunners(['i-1', 'i-2']); + expect(mockEC2Client).toHaveReceivedCommandTimes(TerminateInstancesCommand, 1); + expect(mockEC2Client).toHaveReceivedCommandWith(TerminateInstancesCommand, { + InstanceIds: ['i-1', 'i-2'], + }); + }); + + it('chunks into batches of 100 for large lists', async () => { + mockEC2Client.on(TerminateInstancesCommand).resolves({}); + const ids = Array.from({ length: 150 }, (_, i) => `i-${i}`); + await terminateRunners(ids); + expect(mockEC2Client).toHaveReceivedCommandTimes(TerminateInstancesCommand, 2); + expect(mockEC2Client).toHaveReceivedNthCommandWith(1, TerminateInstancesCommand, { + InstanceIds: ids.slice(0, 100), + }); + expect(mockEC2Client).toHaveReceivedNthCommandWith(2, TerminateInstancesCommand, { + InstanceIds: ids.slice(100), + }); + }); + + it('bisects on error and terminates good instances', async () => { + // i-bad always throws; i-good and i-also-good succeed + mockEC2Client + .on(TerminateInstancesCommand, { InstanceIds: ['i-good', 'i-bad', 'i-also-good'] }) + .rejects(new Error('InvalidInstanceID.NotFound')) + .on(TerminateInstancesCommand, { InstanceIds: ['i-good', 'i-bad'] }) + .rejects(new Error('InvalidInstanceID.NotFound')) + .on(TerminateInstancesCommand, { InstanceIds: ['i-bad'] }) + .rejects(new Error('InvalidInstanceID.NotFound')) + .on(TerminateInstancesCommand, { InstanceIds: ['i-good'] }) + .resolves({}) + .on(TerminateInstancesCommand, { InstanceIds: ['i-also-good'] }) + .resolves({}); + + await terminateRunners(['i-good', 'i-bad', 'i-also-good']); + + // good instances must have been terminated despite the bad id + expect(mockEC2Client).toHaveReceivedCommandWith(TerminateInstancesCommand, { InstanceIds: ['i-good'] }); + expect(mockEC2Client).toHaveReceivedCommandWith(TerminateInstancesCommand, { InstanceIds: ['i-also-good'] }); + }); +}); + describe('tag runner', () => { beforeEach(() => { vi.clearAllMocks(); diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 7f7f5750bf..db70274339 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -102,11 +102,47 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { return runners; } -export async function terminateRunner(instanceId: string): Promise { - logger.debug(`Runner '${instanceId}' will be terminated.`); +// AWS TerminateInstances accepts up to 1000 InstanceIds per call; we keep batches +// at 100 to bound payload + blast radius (a failed batch bisects, see terminateBatch). +const TERMINATE_BATCH_SIZE = 100; + +export async function terminateRunners(instanceIds: string[]): Promise { + if (instanceIds.length === 0) return; // empty InstanceIds errors at the API const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); - await ec2.send(new TerminateInstancesCommand({ InstanceIds: [instanceId] })); - logger.debug(`Runner ${instanceId} has been terminated.`); + for (let i = 0; i < instanceIds.length; i += TERMINATE_BATCH_SIZE) { + await terminateBatch(ec2, instanceIds.slice(i, i + TERMINATE_BATCH_SIZE)); + } +} + +// Common path = 1 EC2 call for the whole batch. TerminateInstances is all-or-nothing +// (one bad id throws InvalidInstanceID.NotFound and nothing is terminated), so on +// error we BISECT. This is recursive: each half re-enters terminateBatch and, if it +// also throws, bisects again — the recursion keeps halving until every batch is +// either terminated whole or down to a single id that is logged and skipped. A +// failure isolates each bad id in O(log n) levels while all good ids still terminate. +async function terminateBatch(ec2: EC2Client, batch: string[]): Promise { + try { + const res = await ec2.send(new TerminateInstancesCommand({ InstanceIds: batch })); + const terminated = new Set((res.TerminatingInstances ?? []).map((i) => i.InstanceId)); + const missing = batch.filter((id) => !terminated.has(id)); + if (missing.length) logger.warn(`Terminate returned no state change for: ${missing.join(', ')}`); + logger.debug(`Runners terminated: ${batch.join(', ')}`); + } catch (e) { + if (batch.length === 1) { + // base case — isolated, surface and move on + logger.error(`Failed to terminate runner '${batch[0]}'`, { error: e as Error }); + return; + } + const mid = Math.ceil(batch.length / 2); + logger.warn(`Batch terminate failed (${batch.length} ids), bisecting.`, { error: e as Error }); + await terminateBatch(ec2, batch.slice(0, mid)); // recurses; bisects again on repeat failure + await terminateBatch(ec2, batch.slice(mid)); + } +} + +// Back-compat thin wrapper — existing single-id callers keep working. +export async function terminateRunner(instanceId: string): Promise { + await terminateRunners([instanceId]); } export async function tag(instanceId: string, tags: Tag[]): Promise { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 42fd442a3f..77fb44b8ea 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -5,7 +5,7 @@ import nock from 'nock'; import { RunnerInfo, RunnerList } from '../aws/runners.d'; import * as ghAuth from '../github/auth'; -import { listEC2Runners, terminateRunner, tag, untag } from './../aws/runners'; +import { listEC2Runners, terminateRunners, tag, untag } from './../aws/runners'; import { githubCache } from './cache'; import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -37,7 +37,7 @@ vi.mock('./../aws/runners', async (importOriginal) => { ...actual, tag: vi.fn(), untag: vi.fn(), - terminateRunner: vi.fn(), + terminateRunners: vi.fn(), listEC2Runners: vi.fn(), }; }); @@ -68,7 +68,7 @@ const mockCreateClient = vi.mocked(ghAuth.createOctokitClient); const mockListRunners = vi.mocked(listEC2Runners); const mockTagRunners = vi.mocked(tag); const mockUntagRunners = vi.mocked(untag); -const mockTerminateRunners = vi.mocked(terminateRunner); +const mockTerminateRunners = vi.mocked(terminateRunners); export interface TestData { repositoryName: string; @@ -208,7 +208,7 @@ describe('Scale down runners', () => { expect(listEC2Runners).toHaveBeenCalledWith({ environment: ENVIRONMENT, }); - expect(terminateRunner).not.toHaveBeenCalled(); + expect(terminateRunners).not.toHaveBeenCalled(); expect(mockOctokit.apps.getRepoInstallation).not.toHaveBeenCalled(); expect(mockOctokit.apps.getRepoInstallation).not.toHaveBeenCalled(); }); @@ -303,7 +303,7 @@ describe('Scale down runners', () => { await scaleDown(); // assert - expect(terminateRunner).not.toHaveBeenCalled(); + expect(terminateRunners).not.toHaveBeenCalled(); checkNonTerminated(runners); }); @@ -407,7 +407,7 @@ describe('Scale down runners', () => { // assert expect(mockUntagRunners).toHaveBeenCalledWith(orphanRunner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); - expect(mockTerminateRunners).not.toHaveBeenCalledWith(orphanRunner.instanceId); + expect(mockTerminateRunners.mock.calls.flatMap((c) => c[0] as string[])).not.toContain(orphanRunner.instanceId); // arrange if (type === 'Repo') { @@ -424,7 +424,7 @@ describe('Scale down runners', () => { await scaleDown(); // assert - expect(mockTerminateRunners).toHaveBeenCalledWith(orphanRunner.instanceId); + expect(mockTerminateRunners.mock.calls.flatMap((c) => c[0] as string[])).toContain(orphanRunner.instanceId); }); it('Should handle 404 error when checking orphaned runner (JIT) - treat as orphaned', async () => { @@ -463,7 +463,7 @@ describe('Scale down runners', () => { await scaleDown(); // assert - should terminate since 404 means runner doesn't exist on GitHub - expect(mockTerminateRunners).toHaveBeenCalledWith(orphanRunner.instanceId); + expect(mockTerminateRunners.mock.calls.flatMap((c) => c[0] as string[])).toContain(orphanRunner.instanceId); }); it('Should handle 404 error when checking runner busy state - treat as not busy', async () => { @@ -539,7 +539,8 @@ describe('Scale down runners', () => { await expect(scaleDown()).resolves.not.toThrow(); // Should not terminate since the error was not a 404 - expect(terminateRunner).not.toHaveBeenCalledWith(orphanRunner.instanceId); + const calledIdsAfterError = mockTerminateRunners.mock.calls.flatMap((call) => call[0] as string[]); + expect(calledIdsAfterError).not.toContain(orphanRunner.instanceId); }); it(`Should ignore errors when termination orphan fails.`, async () => { @@ -664,14 +665,15 @@ describe('Scale down runners', () => { await scaleDown(); // assert + const allCalledIds = mockTerminateRunners.mock.calls.flatMap((call) => call[0] as string[]); const runnersToTerminate = runners.filter((r) => r.shouldBeTerminated); for (const toTerminate of runnersToTerminate) { - expect(terminateRunner).toHaveBeenCalledWith(toTerminate.instanceId); + expect(allCalledIds).toContain(toTerminate.instanceId); } const runnersNotToTerminate = runners.filter((r) => !r.shouldBeTerminated); for (const notTerminated of runnersNotToTerminate) { - expect(terminateRunner).not.toHaveBeenCalledWith(notTerminated.instanceId); + expect(allCalledIds).not.toContain(notTerminated.instanceId); } }); }); @@ -809,16 +811,18 @@ function mockAwsRunners(runners: RunnerTestItem[]) { function checkNonTerminated(runners: RunnerTestItem[]) { const notTerminated = runners.filter((r) => !r.shouldBeTerminated); - for (const toTerminate of notTerminated) { - expect(terminateRunner).not.toHaveBeenCalledWith(toTerminate.instanceId); + const allCalledIds = mockTerminateRunners.mock.calls.flatMap((call) => call[0] as string[]); + for (const r of notTerminated) { + expect(allCalledIds).not.toContain(r.instanceId); } } function checkTerminated(runners: RunnerTestItem[]) { - const runnersToTerminate = runners.filter((r) => r.shouldBeTerminated); - expect(terminateRunner).toHaveBeenCalledTimes(runnersToTerminate.length); - for (const toTerminate of runnersToTerminate) { - expect(terminateRunner).toHaveBeenCalledWith(toTerminate.instanceId); + const idsToTerminate = runners.filter((r) => r.shouldBeTerminated).map((r) => r.instanceId); + if (idsToTerminate.length === 0) return; + const allCalledIds = mockTerminateRunners.mock.calls.flatMap((call) => call[0] as string[]); + for (const id of idsToTerminate) { + expect(allCalledIds).toContain(id); } } diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index c92dddfca9..09426b0094 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -10,7 +10,7 @@ import { createOctokitClient, getStoredInstallationId, } from '../github/auth'; -import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; +import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunners } from './../aws/runners'; import { RunnerInfo, RunnerList } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; @@ -137,7 +137,9 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { return launchTimePlusMinimum < now; } -async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { +// Returns the instanceId when the runner was successfully de-registered from GitHub, +// so the caller can batch-terminate in one EC2 call. Returns undefined otherwise. +async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { const githubAppClient = await getOrCreateOctokit(ec2runner); try { const runnerList = ec2runner as unknown as RunnerList; @@ -145,7 +147,7 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi logger.info( `Runner '${ec2runner.instanceId}' has bypass-removal tag set, skipping removal. Remove the tag to allow scale-down.`, ); - return; + return undefined; } const states = await Promise.all( @@ -174,18 +176,21 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi ); if (statuses.every((status) => status == 204)) { - await terminateRunner(ec2runner.instanceId); - logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); + logger.info(`GitHub runner de-registered for '${ec2runner.instanceId}', queued for termination.`); + return ec2runner.instanceId; } else { logger.error(`Failed to de-register GitHub runner: ${statuses}`); + return undefined; } } else { logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); + return undefined; } } catch (e) { logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, { error: e as Error, }); + return undefined; } } @@ -196,6 +201,7 @@ async function evaluateAndRemoveRunners( let idleCounter = getIdleRunnerCount(scaleDownConfigs); const evictionStrategy = getEvictionStrategy(scaleDownConfigs); const ownerTags = new Set(ec2Runners.map((runner) => runner.owner)); + const toTerminate: string[] = []; for (const ownerTag of ownerTags) { const ec2RunnersFiltered = ec2Runners @@ -221,10 +227,11 @@ async function evaluateAndRemoveRunners( logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`); } else { logger.info(`Terminating all non busy runners.`); - await removeRunner( + const id = await removeRunner( ec2Runner, ghRunnersFiltered.map((runner: { id: number }) => runner.id), ); + if (id) toTerminate.push(id); } } } else if (bootTimeExceeded(ec2Runner)) { @@ -234,6 +241,11 @@ async function evaluateAndRemoveRunners( } } } + + if (toTerminate.length > 0) { + logger.info(`Terminating ${toTerminate.length} runner(s): ${toTerminate.join(', ')}`); + await terminateRunners(toTerminate); + } } async function markOrphan(instanceId: string): Promise { @@ -280,22 +292,28 @@ async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise async function terminateOrphan(environment: string): Promise { try { const orphanRunners = await listEC2Runners({ environment, orphan: true }); + const toTerminate: string[] = []; for (const runner of orphanRunners) { if (runner.runnerId) { const isOrphan = await lastChanceCheckOrphanRunner(runner); if (isOrphan) { - await terminateRunner(runner.instanceId); + toTerminate.push(runner.instanceId); } else { await unMarkOrphan(runner.instanceId); } } else { logger.info(`Terminating orphan runner '${runner.instanceId}'`); - await terminateRunner(runner.instanceId).catch((e) => { - logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); - }); + toTerminate.push(runner.instanceId); } } + + if (toTerminate.length > 0) { + logger.info(`Terminating ${toTerminate.length} orphan runner(s): ${toTerminate.join(', ')}`); + await terminateRunners(toTerminate).catch((e) => { + logger.error(`Failed to terminate orphan runners: ${toTerminate.join(', ')}`, { error: e }); + }); + } } catch (e) { logger.warn(`Failure during orphan termination processing.`, { error: e }); } diff --git a/scripts/check-terminate-quota.sh b/scripts/check-terminate-quota.sh new file mode 100755 index 0000000000..962ae9bdda --- /dev/null +++ b/scripts/check-terminate-quota.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Snapshot EC2 bucket-1 (Fleet mutations) quota usage for cicd-prod (346156333547). +# Run before and after deploying the batched-terminate change to measure impact. +# +# Usage: check-terminate-quota.sh [--from=2d] +# --from accepts pup relative ranges: 1h, 2d, 7d, etc. (default: 2d) +# +# Requires: pup CLI authenticated (pup auth login) +set -euo pipefail + +FROM="${1:---from=2d}" + +q() { + local query="$1" + pup metrics query --query="$query" "$FROM" --output=json \ + | python3 -c ' +import sys, json +d = json.load(sys.stdin) +pts = [p[1] for p in d["data"]["series"][0]["pointlist"] if p[1] is not None] +if not pts: + print("no data") + sys.exit() +s = sorted(pts) +n = len(s) +print(f"p50={s[n//2]:.0f} p95={s[int(n*0.95)]:.0f} p99={s[int(n*0.99)]:.0f} max={max(pts):.0f} calls/min") +' +} + +TERMINATE='sum:aws.usage.call_count.sum{service:ec2,resource:terminateinstances}.rollup(max,60)' +BUCKET1="${TERMINATE}\ ++sum:aws.usage.call_count.sum{service:ec2,resource:createfleet}.rollup(max,60)\ ++sum:aws.usage.call_count.sum{service:ec2,resource:runinstances}.rollup(max,60)\ ++sum:aws.usage.call_count.sum{service:ec2,resource:deletenetworkinterfaces}.rollup(max,60)" + +echo "Window: $FROM" +echo "" +printf "%-30s %s\n" "TerminateInstances alone:" "$(q "$TERMINATE")" +printf "%-30s %s (sustained limit: 540)\n" "Bucket-1 total:" "$(q "$BUCKET1")" From 0056bd103fc6b43e5a638a8c01e0b8ef1dc2a7a4 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 11:21:16 +0200 Subject: [PATCH 02/13] refactor(runners): replace bisection with per-id fallback loop on batch error Recursive bisection was O(log n) extra EC2 calls and required a 6-line comment to explain. A simple loop that retries each id individually on batch failure is equivalent in behaviour (bad id isolated at size 1, all good ids still terminate) and obvious at a glance. Co-Authored-By: Claude Opus 4.8 --- .../control-plane/src/aws/runners.test.ts | 4 +--- .../control-plane/src/aws/runners.ts | 19 +++++-------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index 86b0de6dd6..c2d889ae77 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -307,13 +307,11 @@ describe('terminateRunners', () => { }); }); - it('bisects on error and terminates good instances', async () => { + it('retries individually on batch error and terminates good instances', async () => { // i-bad always throws; i-good and i-also-good succeed mockEC2Client .on(TerminateInstancesCommand, { InstanceIds: ['i-good', 'i-bad', 'i-also-good'] }) .rejects(new Error('InvalidInstanceID.NotFound')) - .on(TerminateInstancesCommand, { InstanceIds: ['i-good', 'i-bad'] }) - .rejects(new Error('InvalidInstanceID.NotFound')) .on(TerminateInstancesCommand, { InstanceIds: ['i-bad'] }) .rejects(new Error('InvalidInstanceID.NotFound')) .on(TerminateInstancesCommand, { InstanceIds: ['i-good'] }) diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index db70274339..182cdb0000 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -103,7 +103,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { } // AWS TerminateInstances accepts up to 1000 InstanceIds per call; we keep batches -// at 100 to bound payload + blast radius (a failed batch bisects, see terminateBatch). +// at 100 to bound payload + blast radius (a failed batch retries per id, see terminateBatch). const TERMINATE_BATCH_SIZE = 100; export async function terminateRunners(instanceIds: string[]): Promise { @@ -116,27 +116,18 @@ export async function terminateRunners(instanceIds: string[]): Promise { // Common path = 1 EC2 call for the whole batch. TerminateInstances is all-or-nothing // (one bad id throws InvalidInstanceID.NotFound and nothing is terminated), so on -// error we BISECT. This is recursive: each half re-enters terminateBatch and, if it -// also throws, bisects again — the recursion keeps halving until every batch is -// either terminated whole or down to a single id that is logged and skipped. A -// failure isolates each bad id in O(log n) levels while all good ids still terminate. +// error we retry each id individually — the bad one errors at size 1 and is logged. async function terminateBatch(ec2: EC2Client, batch: string[]): Promise { try { - const res = await ec2.send(new TerminateInstancesCommand({ InstanceIds: batch })); - const terminated = new Set((res.TerminatingInstances ?? []).map((i) => i.InstanceId)); - const missing = batch.filter((id) => !terminated.has(id)); - if (missing.length) logger.warn(`Terminate returned no state change for: ${missing.join(', ')}`); + await ec2.send(new TerminateInstancesCommand({ InstanceIds: batch })); logger.debug(`Runners terminated: ${batch.join(', ')}`); } catch (e) { if (batch.length === 1) { - // base case — isolated, surface and move on logger.error(`Failed to terminate runner '${batch[0]}'`, { error: e as Error }); return; } - const mid = Math.ceil(batch.length / 2); - logger.warn(`Batch terminate failed (${batch.length} ids), bisecting.`, { error: e as Error }); - await terminateBatch(ec2, batch.slice(0, mid)); // recurses; bisects again on repeat failure - await terminateBatch(ec2, batch.slice(mid)); + logger.warn(`Batch terminate failed (${batch.length} ids), retrying individually.`, { error: e as Error }); + for (const id of batch) await terminateBatch(ec2, [id]); } } From 16dc1f9aa1db3299401f22eda1b12f4fcd6209da Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 11:22:59 +0200 Subject: [PATCH 03/13] refactor: remove terminateRunner back-compat wrapper Single-id callers have been migrated to terminateRunners; the thin wrapper and its dedicated test are no longer needed. Co-Authored-By: Claude Sonnet 4.6 --- .../control-plane/src/aws/runners.test.ts | 21 +------------------ .../control-plane/src/aws/runners.ts | 5 ----- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index c2d889ae77..d09e6dc9ec 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -19,7 +19,7 @@ import 'aws-sdk-client-mock-jest/vitest'; import { beforeEach, describe, expect, it, vi } from 'vitest'; import ScaleError from './../scale-runners/ScaleError'; -import { createRunner, listEC2Runners, tag, terminateRunner, terminateRunners, untag } from './runners'; +import { createRunner, listEC2Runners, tag, terminateRunners, untag } from './runners'; import type { RunnerInfo, RunnerInputParameters, RunnerType } from './runners.d'; process.env.AWS_REGION = 'eu-east-1'; @@ -250,25 +250,6 @@ describe('list instances', () => { }); }); -describe('terminate runner', () => { - beforeEach(() => { - vi.clearAllMocks(); - }); - it('calls terminate instances with the right instance ids', async () => { - mockEC2Client.on(TerminateInstancesCommand).resolves({}); - const runner: RunnerInfo = { - instanceId: 'instance-2', - owner: 'owner-2', - type: 'Repo', - }; - await terminateRunner(runner.instanceId); - - expect(mockEC2Client).toHaveReceivedCommandWith(TerminateInstancesCommand, { - InstanceIds: [runner.instanceId], - }); - }); -}); - describe('terminateRunners', () => { beforeEach(() => { vi.clearAllMocks(); diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 182cdb0000..d5a34ce083 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -131,11 +131,6 @@ async function terminateBatch(ec2: EC2Client, batch: string[]): Promise { } } -// Back-compat thin wrapper — existing single-id callers keep working. -export async function terminateRunner(instanceId: string): Promise { - await terminateRunners([instanceId]); -} - export async function tag(instanceId: string, tags: Tag[]): Promise { logger.debug(`Tagging '${instanceId}'`, { tags }); const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); From 7500bf371df9a88680999641e49fb8ed1a2c0dc1 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 11:25:55 +0200 Subject: [PATCH 04/13] fix(scale-up): migrate remaining terminateRunner caller to terminateRunners The back-compat wrapper was removed in the previous commit but the call site in createRunners() was missed. Wraps the single instanceId in an array to match the new batch signature. Co-Authored-By: Claude Sonnet 4.6 --- lambdas/functions/control-plane/src/scale-runners/scale-up.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 8ebd8810d8..a970b25dd8 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -10,7 +10,7 @@ import { getAppCount, getStoredInstallationId, } from '../github/auth'; -import { createRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners'; +import { createRunner, listEC2Runners, tag, terminateRunners } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { publishRetryMessage } from './job-retry'; @@ -281,7 +281,7 @@ export async function createRunners( for (const instanceId of failedInstances) { try { - await terminateRunner(instanceId); + await terminateRunners([instanceId]); } catch (error) { logger.error('Failed to terminate instance', { instanceId, From db691f70d3aaa20d045b1901d01b3ed553e8ec0c Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 11:45:02 +0200 Subject: [PATCH 05/13] fix(scale-down): isolate per-runner errors in terminateOrphan loop Deferred batching means a throw inside the loop abandons the entire toTerminate array. Wrap each loop iteration in try/catch so a failing lastChanceCheckOrphanRunner call is logged and skipped, and the batch flush always runs for the runners that were successfully evaluated. Co-Authored-By: Claude Sonnet 4.6 --- .../src/scale-runners/scale-down.ts | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 09426b0094..ac7ab0634f 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -295,16 +295,20 @@ async function terminateOrphan(environment: string): Promise { const toTerminate: string[] = []; for (const runner of orphanRunners) { - if (runner.runnerId) { - const isOrphan = await lastChanceCheckOrphanRunner(runner); - if (isOrphan) { - toTerminate.push(runner.instanceId); + try { + if (runner.runnerId) { + const isOrphan = await lastChanceCheckOrphanRunner(runner); + if (isOrphan) { + toTerminate.push(runner.instanceId); + } else { + await unMarkOrphan(runner.instanceId); + } } else { - await unMarkOrphan(runner.instanceId); + logger.info(`Terminating orphan runner '${runner.instanceId}'`); + toTerminate.push(runner.instanceId); } - } else { - logger.info(`Terminating orphan runner '${runner.instanceId}'`); - toTerminate.push(runner.instanceId); + } catch (e) { + logger.error(`Failed to evaluate orphan runner '${runner.instanceId}', skipping.`, { error: e as Error }); } } From 6d5006ba70ac2c989c80628e6dba48bf3b3bece3 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 11:45:20 +0200 Subject: [PATCH 06/13] fix(scale-down): move getOrCreateOctokit inside removeRunner's try/catch getOrCreateOctokit was called before the try block, so an auth failure escaped removeRunner entirely instead of returning undefined. This broke removeRunner's documented contract (returns undefined on any failure) and caused evaluateAndRemoveRunners to abort its loop, abandoning the entire toTerminate batch for subsequent runners. Co-Authored-By: Claude Sonnet 4.6 --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index ac7ab0634f..08f3e715fa 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -140,8 +140,8 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { // Returns the instanceId when the runner was successfully de-registered from GitHub, // so the caller can batch-terminate in one EC2 call. Returns undefined otherwise. async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { - const githubAppClient = await getOrCreateOctokit(ec2runner); try { + const githubAppClient = await getOrCreateOctokit(ec2runner); const runnerList = ec2runner as unknown as RunnerList; if (runnerList.bypassRemoval) { logger.info( From b05380c4d0ba39a14c2bf422a738622a5c442b0c Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 11:45:50 +0200 Subject: [PATCH 07/13] test(scale-down): restore exact-set assertion in checkTerminated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The migration from terminateRunner to terminateRunners weakened checkTerminated from toHaveBeenCalledTimes(n) + per-id check to per-id toContain — duplicate or spurious termination calls would pass undetected. Restore the equivalent guarantee with sorted-array equality. Co-Authored-By: Claude Sonnet 4.6 --- .../control-plane/src/scale-runners/scale-down.test.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 77fb44b8ea..8192604615 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -821,9 +821,7 @@ function checkTerminated(runners: RunnerTestItem[]) { const idsToTerminate = runners.filter((r) => r.shouldBeTerminated).map((r) => r.instanceId); if (idsToTerminate.length === 0) return; const allCalledIds = mockTerminateRunners.mock.calls.flatMap((call) => call[0] as string[]); - for (const id of idsToTerminate) { - expect(allCalledIds).toContain(id); - } + expect([...allCalledIds].sort()).toEqual([...idsToTerminate].sort()); } function mockGitHubRunners(runners: RunnerTestItem[]) { From 8f944075c6beb48f56e1cd5bcec33f6750fafc26 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 15:07:53 +0200 Subject: [PATCH 08/13] =?UTF-8?q?refactor(scale-down):=20rename=20removeRu?= =?UTF-8?q?nner=20=E2=86=92=20deregisterRunner?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function only de-registers the GitHub runner; EC2 termination is now batched by the caller. The old name implied full removal (de-register + EC2 terminate), which is no longer accurate. Co-Authored-By: Claude Opus 4.8 --- .../functions/control-plane/src/scale-runners/scale-down.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 08f3e715fa..535511780b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -139,7 +139,7 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { // Returns the instanceId when the runner was successfully de-registered from GitHub, // so the caller can batch-terminate in one EC2 call. Returns undefined otherwise. -async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { +async function deregisterRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise { try { const githubAppClient = await getOrCreateOctokit(ec2runner); const runnerList = ec2runner as unknown as RunnerList; @@ -227,7 +227,7 @@ async function evaluateAndRemoveRunners( logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`); } else { logger.info(`Terminating all non busy runners.`); - const id = await removeRunner( + const id = await deregisterRunner( ec2Runner, ghRunnersFiltered.map((runner: { id: number }) => runner.id), ); From 09e4c5d7d8f665579e1643da345849eb54484c72 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 15:08:19 +0200 Subject: [PATCH 09/13] fix(scale-down): move 'queued for termination' log to the caller deregisterRunner only de-registers; the queueing (push to toTerminate) happens in evaluateAndRemoveRunners. Log each step where it actually occurs: de-registration stays in deregisterRunner, queueing log moves to the push site. Co-Authored-By: Claude Opus 4.8 --- .../control-plane/src/scale-runners/scale-down.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 535511780b..aa90d7e74b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -176,7 +176,7 @@ async function deregisterRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): P ); if (statuses.every((status) => status == 204)) { - logger.info(`GitHub runner de-registered for '${ec2runner.instanceId}', queued for termination.`); + logger.info(`GitHub runner de-registered for '${ec2runner.instanceId}'.`); return ec2runner.instanceId; } else { logger.error(`Failed to de-register GitHub runner: ${statuses}`); @@ -231,7 +231,10 @@ async function evaluateAndRemoveRunners( ec2Runner, ghRunnersFiltered.map((runner: { id: number }) => runner.id), ); - if (id) toTerminate.push(id); + if (id) { + toTerminate.push(id); + logger.info(`Runner '${id}' queued for termination.`); + } } } } else if (bootTimeExceeded(ec2Runner)) { From 3cb32a09f9af4b067126624c5e07d22c5c46a7c8 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 15:08:42 +0200 Subject: [PATCH 10/13] fix(scripts): strip --from= prefix from Window label in check-terminate-quota.sh Printed 'Window: --from=2h' instead of 'Window: 2h'. Strip the flag prefix with bash parameter expansion when echoing. Co-Authored-By: Claude Opus 4.8 --- scripts/check-terminate-quota.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/check-terminate-quota.sh b/scripts/check-terminate-quota.sh index 962ae9bdda..64353d9ccb 100755 --- a/scripts/check-terminate-quota.sh +++ b/scripts/check-terminate-quota.sh @@ -32,7 +32,7 @@ BUCKET1="${TERMINATE}\ +sum:aws.usage.call_count.sum{service:ec2,resource:runinstances}.rollup(max,60)\ +sum:aws.usage.call_count.sum{service:ec2,resource:deletenetworkinterfaces}.rollup(max,60)" -echo "Window: $FROM" +echo "Window: ${FROM#--from=}" echo "" printf "%-30s %s\n" "TerminateInstances alone:" "$(q "$TERMINATE")" printf "%-30s %s (sustained limit: 540)\n" "Bucket-1 total:" "$(q "$BUCKET1")" From 001dde9779594d7aef9cb8fab2b472fc7b01db62 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 15:35:05 +0200 Subject: [PATCH 11/13] test(scale-down): assert terminateRunners not called when no terminations expected checkTerminated returned early when idsToTerminate was empty, so tests expecting zero terminations never verified terminateRunners stayed uncalled. Remove the guard: empty expected set now asserts allCalledIds == [], catching any spurious termination call. Co-Authored-By: Claude Opus 4.8 --- .../functions/control-plane/src/scale-runners/scale-down.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 8192604615..0aa577889f 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -819,7 +819,6 @@ function checkNonTerminated(runners: RunnerTestItem[]) { function checkTerminated(runners: RunnerTestItem[]) { const idsToTerminate = runners.filter((r) => r.shouldBeTerminated).map((r) => r.instanceId); - if (idsToTerminate.length === 0) return; const allCalledIds = mockTerminateRunners.mock.calls.flatMap((call) => call[0] as string[]); expect([...allCalledIds].sort()).toEqual([...idsToTerminate].sort()); } From f1af786021b32a4af41b8a21d8b2baefe39b2a95 Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 15:35:33 +0200 Subject: [PATCH 12/13] =?UTF-8?q?fix(scale-down):=20clarify=20idle-limit?= =?UTF-8?q?=20log=20=E2=80=94=20de-registering,=20not=20terminating?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'Terminating all non busy runners.' logged before deregisterRunner, before any EC2 call. Reword to 'Runner X exceeds idle limit, de-registering.' so the log matches what happens at that line. EC2 termination is logged later when the deferred batch is flushed. Co-Authored-By: Claude Opus 4.8 --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index aa90d7e74b..785467b670 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -226,7 +226,7 @@ async function evaluateAndRemoveRunners( idleCounter--; logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`); } else { - logger.info(`Terminating all non busy runners.`); + logger.info(`Runner '${ec2Runner.instanceId}' exceeds idle limit, de-registering.`); const id = await deregisterRunner( ec2Runner, ghRunnersFiltered.map((runner: { id: number }) => runner.id), From b2e705e9b4d77839426a3a37ba970444d200f68d Mon Sep 17 00:00:00 2001 From: Renaud Martinet Date: Tue, 23 Jun 2026 15:35:56 +0200 Subject: [PATCH 13/13] =?UTF-8?q?fix(scale-down):=20clarify=20orphan=20log?= =?UTF-8?q?=20=E2=80=94=20queuing,=20not=20terminating?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'Terminating orphan runner X' logged when the code only pushes to toTerminate. Actual EC2 termination happens after the loop. Reword to 'Queuing orphan runner X for termination.' to match what the line actually does. Co-Authored-By: Claude Opus 4.8 --- lambdas/functions/control-plane/src/scale-runners/scale-down.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 785467b670..4f0386180e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -307,7 +307,7 @@ async function terminateOrphan(environment: string): Promise { await unMarkOrphan(runner.instanceId); } } else { - logger.info(`Terminating orphan runner '${runner.instanceId}'`); + logger.info(`Queuing orphan runner '${runner.instanceId}' for termination.`); toTerminate.push(runner.instanceId); } } catch (e) {