Skip to content

Commit 2a53b4a

Browse files
fix(ec2): address third round of review comments
1. Add rollback on verify failure: if DescribeInstances throws during the tag-then-verify claim, roll back the busy/task-id tags so the instance isn't stuck. 2. Use docker container prune instead of docker system prune in cleanup trap to preserve cached images and avoid re-pulling on next task. 3. Add ecr:BatchCheckLayerAvailability to instance role ECR permissions — required for docker pull from ECR. 4. InvocationDoesNotExist now rethrows instead of returning failed, letting the orchestrator's consecutiveComputePollFailures counter handle transient propagation delays (fails after 3 consecutive).
1 parent b6caa02 commit 2a53b4a

3 files changed

Lines changed: 37 additions & 16 deletions

File tree

cdk/src/constructs/ec2-agent-fleet.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ export class Ec2AgentFleet extends Construct {
125125
this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({
126126
actions: [
127127
'ecr:BatchGetImage',
128+
'ecr:BatchCheckLayerAvailability',
128129
'ecr:GetDownloadUrlForLayer',
129130
],
130131
resources: [props.agentImageAsset.repository.repositoryArn],

cdk/src/handlers/shared/strategies/ec2-strategy.ts

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -111,22 +111,42 @@ export class Ec2ComputeStrategy implements ComputeStrategy {
111111
],
112112
}));
113113

114-
// 3b. Re-describe to verify we won the race
115-
const verifyResult = await getEc2Client().send(new DescribeInstancesCommand({
116-
InstanceIds: [candidateId],
117-
}));
118-
const verifiedInstance = verifyResult.Reservations?.[0]?.Instances?.[0];
119-
const taskIdTag = verifiedInstance?.Tags?.find(t => t.Key === 'bgagent:task-id');
114+
// 3b. Re-describe to verify we won the race. If the verify call itself
115+
// fails (throttle, network), roll back the tags so the instance isn't
116+
// stuck as busy, then propagate the error.
117+
let verified = false;
118+
try {
119+
const verifyResult = await getEc2Client().send(new DescribeInstancesCommand({
120+
InstanceIds: [candidateId],
121+
}));
122+
const verifiedInstance = verifyResult.Reservations?.[0]?.Instances?.[0];
123+
const taskIdTag = verifiedInstance?.Tags?.find(t => t.Key === 'bgagent:task-id');
124+
verified = taskIdTag?.Value === taskId;
125+
} catch (verifyErr) {
126+
// Best-effort rollback before propagating
127+
try {
128+
await getEc2Client().send(new CreateTagsCommand({
129+
Resources: [candidateId],
130+
Tags: [{ Key: 'bgagent:status', Value: 'idle' }],
131+
}));
132+
await getEc2Client().send(new DeleteTagsCommand({
133+
Resources: [candidateId],
134+
Tags: [{ Key: 'bgagent:task-id' }],
135+
}));
136+
} catch {
137+
logger.warn('Failed to rollback instance tags after verify failure', { instance_id: candidateId, task_id: taskId });
138+
}
139+
throw verifyErr;
140+
}
120141

121-
if (taskIdTag?.Value === taskId) {
142+
if (verified) {
122143
instanceId = candidateId;
123144
break;
124145
}
125146

126147
logger.warn('Lost instance claim race, trying next candidate', {
127148
task_id: taskId,
128149
instance_id: candidateId,
129-
claimed_by: taskIdTag?.Value,
130150
});
131151
}
132152

@@ -151,7 +171,7 @@ export class Ec2ComputeStrategy implements ComputeStrategy {
151171
'',
152172
'# Cleanup trap — always retag instance as idle on exit (success, error, or signal)',
153173
'cleanup() {',
154-
' docker system prune -f || true',
174+
' docker container prune -f || true',
155175
' rm -f /tmp/payload.json',
156176
' aws ec2 create-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:status,Value=idle || true',
157177
' aws ec2 delete-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:task-id || true',
@@ -252,10 +272,11 @@ export class Ec2ComputeStrategy implements ComputeStrategy {
252272
return { status: 'running' };
253273
}
254274
} catch (err) {
255-
const errName = err instanceof Error ? err.name : undefined;
256-
if (errName === 'InvocationDoesNotExist') {
257-
return { status: 'failed', error: 'SSM command invocation not found' };
258-
}
275+
// InvocationDoesNotExist can occur transiently while the command is
276+
// still propagating to the instance. Rethrow so the orchestrator's
277+
// consecutiveComputePollFailures counter handles it — the command
278+
// will be retried on the next poll cycle and only failed after 3
279+
// consecutive errors.
259280
throw err;
260281
}
261282
}

cdk/test/handlers/shared/strategies/ec2-strategy.test.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -287,14 +287,13 @@ describe('Ec2ComputeStrategy', () => {
287287
expect(result).toEqual({ status: 'running' });
288288
});
289289

290-
test('returns failed when InvocationDoesNotExist', async () => {
290+
test('throws InvocationDoesNotExist so orchestrator retry counter handles it', async () => {
291291
const err = new Error('Invocation does not exist');
292292
err.name = 'InvocationDoesNotExist';
293293
mockSsmSend.mockRejectedValueOnce(err);
294294

295295
const strategy = new Ec2ComputeStrategy();
296-
const result = await strategy.pollSession(makeHandle());
297-
expect(result).toEqual({ status: 'failed', error: 'SSM command invocation not found' });
296+
await expect(strategy.pollSession(makeHandle())).rejects.toThrow('Invocation does not exist');
298297
});
299298

300299
test('throws when handle is not ec2 type', async () => {

0 commit comments

Comments (0)