fix(ec2): address TOCTOU race and heartbeat false-positive for EC2 tasks

MichaelWalker-git · MichaelWalker-git · commit 6420ed977961 · 2026-04-14T14:22:12.000-07:00
1. TOCTOU race in instance selection: after tagging an instance as busy,
   re-describe to verify our task-id stuck. If another orchestrator won
   the race, try the next idle candidate instead of double-dispatching.

2. Heartbeat false-positive: EC2/ECS tasks invoke run_task() directly
   and may not send continuous heartbeats. Suppress sessionUnhealthy
   checks when compute-level crash detection (pollSession) is active,
   preventing premature task failure after ~6 minutes.

3. SSM Cancelling status: map to 'running' (transient) instead of
   'failed' to avoid premature failure while cancel propagates.

4. Fix babel parse errors in test mocks (remove `: unknown` annotations
   from jest.mock factory callbacks).
diff --git a/cdk/src/handlers/orchestrate-task.ts b/cdk/src/handlers/orchestrate-task.ts
@@ -231,15 +231,22 @@ const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = asyn
         }
       }
 
-      return { ...ddbState, consecutiveComputePollFailures, consecutiveComputeCompletedPolls };
+      // For ECS/EC2 tasks, suppress heartbeat-based sessionUnhealthy since those
+      // backends have compute-level crash detection and may not send heartbeats.
+      const suppressHeartbeat = computeStrategy ? { sessionUnhealthy: false } : {};
+      return { ...ddbState, ...suppressHeartbeat, consecutiveComputePollFailures, consecutiveComputeCompletedPolls };
     },
     {
       initialState: { attempts: 0 },
       waitStrategy: (state: PollState) => {
         if (state.lastStatus && TERMINAL_STATUSES.includes(state.lastStatus)) {
           return { shouldContinue: false };
         }
-        if (state.sessionUnhealthy) {
+        // Heartbeat-based health checks only apply to AgentCore tasks.
+        // ECS/EC2 tasks have compute-level crash detection (pollSession) in the
+        // poll callback, so stale heartbeats should not terminate polling early
+        // — the agent entrypoint on those backends may not send continuous heartbeats.
+        if (state.sessionUnhealthy && !computeStrategy) {
           return { shouldContinue: false };
         }
         if (state.attempts >= MAX_POLL_ATTEMPTS) {
diff --git a/cdk/src/handlers/shared/strategies/ec2-strategy.ts b/cdk/src/handlers/shared/strategies/ec2-strategy.ts
@@ -80,7 +80,10 @@ export class Ec2ComputeStrategy implements ComputeStrategy {
       ContentType: 'application/json',
     }));
 
-    // 2. Find an idle instance
+    // 2. Find an idle instance and claim it via best-effort tag-then-verify.
+    // CreateTags overwrites unconditionally (last writer wins), so this is not
+    // atomic: after tagging we re-describe to confirm our task-id stuck, and if
+    // another invocation overwrote the tag, we try the next candidate.
     const describeResult = await getEc2Client().send(new DescribeInstancesCommand({
       Filters: [
         { Name: `tag:${EC2_FLEET_TAG_KEY}`, Values: [EC2_FLEET_TAG_VALUE] },
@@ -89,21 +92,47 @@ export class Ec2ComputeStrategy implements ComputeStrategy {
       ],
     }));
 
-    const instances = (describeResult.Reservations ?? []).flatMap(r => r.Instances ?? []);
-    if (instances.length === 0 || !instances[0]?.InstanceId) {
+    const candidates = (describeResult.Reservations ?? []).flatMap(r => r.Instances ?? []);
+    if (candidates.length === 0) {
       throw new Error('No idle EC2 instances available in fleet');
     }
 
-    const instanceId = instances[0].InstanceId;
+    let instanceId: string | undefined;
+    for (const candidate of candidates) {
+      const candidateId = candidate.InstanceId;
+      if (!candidateId) continue;
 
-    // 3. Tag instance as busy
-    await getEc2Client().send(new CreateTagsCommand({
-      Resources: [instanceId],
-      Tags: [
-        { Key: 'bgagent:status', Value: 'busy' },
-        { Key: 'bgagent:task-id', Value: taskId },
-      ],
-    }));
+      // 3a. Tag instance as busy with our task-id
+      await getEc2Client().send(new CreateTagsCommand({
+        Resources: [candidateId],
+        Tags: [
+          { Key: 'bgagent:status', Value: 'busy' },
+          { Key: 'bgagent:task-id', Value: taskId },
+        ],
+      }));
+
+      // 3b. Re-describe to verify we won the race
+      const verifyResult = await getEc2Client().send(new DescribeInstancesCommand({
+        InstanceIds: [candidateId],
+      }));
+      const verifiedInstance = verifyResult.Reservations?.[0]?.Instances?.[0];
+      const taskIdTag = verifiedInstance?.Tags?.find(t => t.Key === 'bgagent:task-id');
+
+      if (taskIdTag?.Value === taskId) {
+        instanceId = candidateId;
+        break;
+      }
+
+      logger.warn('Lost instance claim race, trying next candidate', {
+        task_id: taskId,
+        instance_id: candidateId,
+        claimed_by: taskIdTag?.Value,
+      });
+    }
+
+    if (!instanceId) {
+      throw new Error('No idle EC2 instances available in fleet (all candidates claimed by other tasks)');
+    }
 
     // 4. Build the boot script
     // All task data is read from the S3 payload at runtime to avoid shell
@@ -209,13 +238,13 @@ export class Ec2ComputeStrategy implements ComputeStrategy {
         case 'InProgress':
         case 'Pending':
         case 'Delayed':
+        case 'Cancelling': // transient — command still running while cancel propagates
           return { status: 'running' };
         case 'Success':
           return { status: 'completed' };
         case 'Failed':
         case 'Cancelled':
         case 'TimedOut':
-        case 'Cancelling':
           return { status: 'failed', error: result.StatusDetails ?? `SSM command ${status}` };
         default:
           // Covers any unexpected status values — treat as running to avoid
diff --git a/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts b/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts
@@ -33,23 +33,23 @@ process.env.ECR_IMAGE_URI = ECR_IMAGE;
 const mockEc2Send = jest.fn();
 jest.mock('@aws-sdk/client-ec2', () => ({
   EC2Client: jest.fn(() => ({ send: mockEc2Send })),
-  DescribeInstancesCommand: jest.fn((input: unknown) => ({ _type: 'DescribeInstances', input })),
-  CreateTagsCommand: jest.fn((input: unknown) => ({ _type: 'CreateTags', input })),
-  DeleteTagsCommand: jest.fn((input: unknown) => ({ _type: 'DeleteTags', input })),
+  DescribeInstancesCommand: jest.fn((input) => ({ _type: 'DescribeInstances', input })),
+  CreateTagsCommand: jest.fn((input) => ({ _type: 'CreateTags', input })),
+  DeleteTagsCommand: jest.fn((input) => ({ _type: 'DeleteTags', input })),
 }));
 
 const mockSsmSend = jest.fn();
 jest.mock('@aws-sdk/client-ssm', () => ({
   SSMClient: jest.fn(() => ({ send: mockSsmSend })),
-  SendCommandCommand: jest.fn((input: unknown) => ({ _type: 'SendCommand', input })),
-  GetCommandInvocationCommand: jest.fn((input: unknown) => ({ _type: 'GetCommandInvocation', input })),
-  CancelCommandCommand: jest.fn((input: unknown) => ({ _type: 'CancelCommand', input })),
+  SendCommandCommand: jest.fn((input) => ({ _type: 'SendCommand', input })),
+  GetCommandInvocationCommand: jest.fn((input) => ({ _type: 'GetCommandInvocation', input })),
+  CancelCommandCommand: jest.fn((input) => ({ _type: 'CancelCommand', input })),
 }));
 
 const mockS3Send = jest.fn();
 jest.mock('@aws-sdk/client-s3', () => ({
   S3Client: jest.fn(() => ({ send: mockS3Send })),
-  PutObjectCommand: jest.fn((input: unknown) => ({ _type: 'PutObject', input })),
+  PutObjectCommand: jest.fn((input) => ({ _type: 'PutObject', input })),
 }));
 
 import { Ec2ComputeStrategy } from '../../../../src/handlers/shared/strategies/ec2-strategy';
@@ -65,7 +65,7 @@ describe('Ec2ComputeStrategy', () => {
   });
 
   describe('startSession', () => {
-    test('finds idle instance, tags as busy, uploads to S3, sends SSM command, returns handle', async () => {
+    test('finds idle instance, tags as busy, verifies claim, uploads to S3, sends SSM command, returns handle', async () => {
       // S3 upload
       mockS3Send.mockResolvedValueOnce({});
       // DescribeInstances — return one idle instance
@@ -74,6 +74,10 @@ describe('Ec2ComputeStrategy', () => {
       });
       // CreateTags (mark busy)
       mockEc2Send.mockResolvedValueOnce({});
+      // DescribeInstances — verify claim (tag matches our task-id)
+      mockEc2Send.mockResolvedValueOnce({
+        Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID, Tags: [{ Key: 'bgagent:task-id', Value: 'TASK001' }] }] }],
+      });
       // SSM SendCommand
       mockSsmSend.mockResolvedValueOnce({
         Command: { CommandId: COMMAND_ID },
@@ -98,8 +102,8 @@ describe('Ec2ComputeStrategy', () => {
       expect(s3Call.input.Bucket).toBe(PAYLOAD_BUCKET);
       expect(s3Call.input.Key).toBe('tasks/TASK001/payload.json');
 
-      // Verify DescribeInstances filter
-      expect(mockEc2Send).toHaveBeenCalledTimes(2);
+      // Verify EC2 calls: DescribeInstances (find idle), CreateTags (claim), DescribeInstances (verify)
+      expect(mockEc2Send).toHaveBeenCalledTimes(3);
       const describeCall = mockEc2Send.mock.calls[0][0];
       expect(describeCall.input.Filters).toEqual(expect.arrayContaining([
         expect.objectContaining({ Name: `tag:${FLEET_TAG_KEY}`, Values: [FLEET_TAG_VALUE] }),
@@ -123,6 +127,43 @@ describe('Ec2ComputeStrategy', () => {
       expect(ssmCall.input.TimeoutSeconds).toBe(32400);
     });
 
+    test('tries next candidate when race is lost on first instance', async () => {
+      const INSTANCE_ID_2 = 'i-0987654321fedcba0';
+      // S3 upload
+      mockS3Send.mockResolvedValueOnce({});
+      // DescribeInstances — return two idle instances
+      mockEc2Send.mockResolvedValueOnce({
+        Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID }, { InstanceId: INSTANCE_ID_2 }] }],
+      });
+      // CreateTags on first instance
+      mockEc2Send.mockResolvedValueOnce({});
+      // Verify first instance — another task claimed it
+      mockEc2Send.mockResolvedValueOnce({
+        Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID, Tags: [{ Key: 'bgagent:task-id', Value: 'OTHER_TASK' }] }] }],
+      });
+      // CreateTags on second instance
+      mockEc2Send.mockResolvedValueOnce({});
+      // Verify second instance — our task-id stuck
+      mockEc2Send.mockResolvedValueOnce({
+        Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID_2, Tags: [{ Key: 'bgagent:task-id', Value: 'TASK001' }] }] }],
+      });
+      // SSM SendCommand
+      mockSsmSend.mockResolvedValueOnce({
+        Command: { CommandId: COMMAND_ID },
+      });
+
+      const strategy = new Ec2ComputeStrategy();
+      const handle = await strategy.startSession({
+        taskId: 'TASK001',
+        payload: { repo_url: 'org/repo' },
+        blueprintConfig: { compute_type: 'ec2', runtime_arn: '' },
+      });
+
+      const ec2Handle = handle as Extract<typeof handle, { strategyType: 'ec2' }>;
+      expect(ec2Handle.instanceId).toBe(INSTANCE_ID_2);
+      expect(mockEc2Send).toHaveBeenCalledTimes(5); // describe + 2*(tag + verify)
+    });
+
     test('throws when no idle instances available', async () => {
       // S3 upload
       mockS3Send.mockResolvedValueOnce({});
@@ -148,6 +189,10 @@ describe('Ec2ComputeStrategy', () => {
       });
       // CreateTags
       mockEc2Send.mockResolvedValueOnce({});
+      // DescribeInstances — verify claim
+      mockEc2Send.mockResolvedValueOnce({
+        Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID, Tags: [{ Key: 'bgagent:task-id', Value: 'TASK001' }] }] }],
+      });
       // SSM SendCommand — return no CommandId
       mockSsmSend.mockResolvedValueOnce({ Command: {} });
 
@@ -226,12 +271,12 @@ describe('Ec2ComputeStrategy', () => {
       expect(result).toEqual({ status: 'failed', error: 'Command timed out' });
     });
 
-    test('returns failed for Cancelling status', async () => {
+    test('returns running for Cancelling status (transient)', async () => {
       mockSsmSend.mockResolvedValueOnce({ Status: 'Cancelling', StatusDetails: 'Command is being cancelled' });
 
       const strategy = new Ec2ComputeStrategy();
       const result = await strategy.pollSession(makeHandle());
-      expect(result).toEqual({ status: 'failed', error: 'Command is being cancelled' });
+      expect(result).toEqual({ status: 'running' });
     });
 
     test('returns running for unknown status (default case)', async () => {

Original file line number	Diff line number	Diff line change
`@@ -231,15 +231,22 @@ const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = asyn`
`231`	`231`	`}`
`232`	`232`	`}`
`233`	`233`
`234`		`- return { ...ddbState, consecutiveComputePollFailures, consecutiveComputeCompletedPolls };`
	`234`	`+ // For ECS/EC2 tasks, suppress heartbeat-based sessionUnhealthy since those`
	`235`	`+ // backends have compute-level crash detection and may not send heartbeats.`
	`236`	`+ const suppressHeartbeat = computeStrategy ? { sessionUnhealthy: false } : {};`
	`237`	`+ return { ...ddbState, ...suppressHeartbeat, consecutiveComputePollFailures, consecutiveComputeCompletedPolls };`
`235`	`238`	`},`
`236`	`239`	`{`
`237`	`240`	`initialState: { attempts: 0 },`
`238`	`241`	`waitStrategy: (state: PollState) => {`
`239`	`242`	`if (state.lastStatus && TERMINAL_STATUSES.includes(state.lastStatus)) {`
`240`	`243`	`return { shouldContinue: false };`
`241`	`244`	`}`
`242`		`- if (state.sessionUnhealthy) {`
	`245`	`+ // Heartbeat-based health checks only apply to AgentCore tasks.`
	`246`	`+ // ECS/EC2 tasks have compute-level crash detection (pollSession) in the`
	`247`	`+ // poll callback, so stale heartbeats should not terminate polling early`
	`248`	`+ // — the agent entrypoint on those backends may not send continuous heartbeats.`
	`249`	`+ if (state.sessionUnhealthy && !computeStrategy) {`
`243`	`250`	`return { shouldContinue: false };`
`244`	`251`	`}`
`245`	`252`	`if (state.attempts >= MAX_POLL_ATTEMPTS) {`