Commit 42566d0
chore(pr): address Copilot review feedback
- Remove unnecessary iam:PassRole from orchestrator (EC2 strategy never passes a role to any API)
- Simplify ec2FleetConfig in task-api to empty object (instanceRoleArn was unused)
- Use CDK Tags.of() for ASG fleet tag propagation instead of no-op user-data tagging — instances are now tagged at launch
- Fix missing AWS_REGION in boot script by deriving it from IMDS
- Eliminate shell injection risk by reading all task data from the S3 payload at runtime instead of interpolating it into bash exports
- Add cleanup trap in boot script to always retag instance as idle on exit (success, error, or signal)
- Add try/catch rollback in startSession to retag instance as idle when SSM dispatch fails
- Generalize ECS-specific log messages in poll loop to be compute-backend-agnostic (uses strategy type label)
1 parent bdb89f4 commit 42566d0

7 files changed

Lines changed: 94 additions & 100 deletions


cdk/src/constructs/ec2-agent-fleet.ts

Lines changed: 4 additions & 5 deletions
@@ -17,7 +17,7 @@
  * SOFTWARE.
  */

-import { Duration, RemovalPolicy } from 'aws-cdk-lib';
+import { Duration, RemovalPolicy, Tags } from 'aws-cdk-lib';
 import * as autoscaling from 'aws-cdk-lib/aws-autoscaling';
 import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
 import * as ec2 from 'aws-cdk-lib/aws-ec2';
@@ -184,10 +184,9 @@ export class Ec2AgentFleet extends Construct {
       healthCheck: autoscaling.HealthCheck.ec2(),
     });

-    // Tag the ASG instances for fleet identification
-    // CDK auto-propagates tags from the ASG to instances
-    this.autoScalingGroup.node.defaultChild;
-    this.autoScalingGroup.addUserData(`aws ec2 create-tags --resources "$(ec2-metadata -i | cut -d' ' -f2)" --region "$(ec2-metadata --availability-zone | cut -d' ' -f2 | sed 's/.$//')" --tags Key=${this.fleetTagKey},Value=${this.fleetTagValue}`);
+    // Tag ASG instances for fleet identification — CDK propagates these at launch
+    Tags.of(this.autoScalingGroup).add(this.fleetTagKey, this.fleetTagValue);
+    Tags.of(this.autoScalingGroup).add('bgagent:status', 'idle');

     NagSuppressions.addResourceSuppressions(this.instanceRole, [
       {
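For context on the Tags.of() change: CDK tag aspects on an AutoScalingGroup render as ASG tags with PropagateAtLaunch enabled by default (applyToLaunchedInstances defaults to true), so instances carry the tags from boot rather than racing a user-data CLI call. A minimal standalone sketch; the construct names here are illustrative, not from this repo:

import { App, Stack, Tags } from 'aws-cdk-lib';
import * as autoscaling from 'aws-cdk-lib/aws-autoscaling';
import * as ec2 from 'aws-cdk-lib/aws-ec2';

const app = new App();
const stack = new Stack(app, 'FleetTagDemo'); // hypothetical stack for illustration
const vpc = new ec2.Vpc(stack, 'Vpc');
const asg = new autoscaling.AutoScalingGroup(stack, 'Asg', {
  vpc,
  instanceType: ec2.InstanceType.of(ec2.InstanceClass.T3, ec2.InstanceSize.MICRO),
  machineImage: ec2.MachineImage.latestAmazonLinux2023(),
});

// The tag aspect becomes an ASG tag with PropagateAtLaunch: true, so every
// instance the group launches is tagged before user data even runs.
Tags.of(asg).add('bgagent:status', 'idle', { applyToLaunchedInstances: true });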

cdk/src/constructs/task-api.ts

Lines changed: 1 addition & 3 deletions
@@ -111,9 +111,7 @@ export interface TaskApiProps {
   * EC2 fleet configuration for cancel-task to stop EC2-backed tasks.
   * When provided, the cancel Lambda gets `ssm:CancelCommand` permission.
   */
-  readonly ec2FleetConfig?: {
-    readonly instanceRoleArn: string;
-  };
+  readonly ec2FleetConfig?: Record<string, never>;
 }

 /**
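Why Record<string, never> instead of deleting the prop outright: the type admits only an empty object, so the prop's presence still toggles the cancel Lambda's ssm:CancelCommand grant while any stale field is rejected at compile time. A small illustration (interface and prop names are hypothetical):

interface DemoProps {
  readonly ec2FleetConfig?: Record<string, never>;
}

const off: DemoProps = {};                      // feature disabled
const on: DemoProps = { ec2FleetConfig: {} };   // feature enabled
// const bad: DemoProps = { ec2FleetConfig: { instanceRoleArn: 'arn:...' } };
// ^ compile error: type 'string' is not assignable to type 'never'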

cdk/src/constructs/task-orchestrator.ts

Lines changed: 3 additions & 10 deletions
@@ -137,7 +137,6 @@ export interface TaskOrchestratorProps {
     readonly fleetTagValue: string;
     readonly payloadBucketName: string;
     readonly ecrImageUri: string;
-    readonly instanceRoleArn: string;
   };
 }

@@ -304,15 +303,9 @@ export class TaskOrchestrator extends Construct {
       resources: [`arn:${Aws.PARTITION}:s3:::${props.ec2Config.payloadBucketName}/*`],
     }));

-    this.fn.addToRolePolicy(new iam.PolicyStatement({
-      actions: ['iam:PassRole'],
-      resources: [props.ec2Config.instanceRoleArn],
-      conditions: {
-        StringEquals: {
-          'iam:PassedToService': 'ec2.amazonaws.com',
-        },
-      },
-    }));
+    // Note: iam:PassRole is not needed — the orchestrator does not pass the
+    // instance role to any EC2 API. The ASG launch template handles instance
+    // profile association at fleet creation time.
   }

   // Per-repo Secrets Manager grants (e.g. per-repo GitHub tokens from Blueprints)
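The removed statement reflects the usual iam:PassRole rule: a principal needs it only when it calls an API that attaches a role to a resource, such as ec2:RunInstances with an IamInstanceProfile. This orchestrator only tags instances and sends SSM commands, so nothing is ever passed. For contrast, a hedged sketch of the statement a Lambda that did launch instances would need (the role ARN is a placeholder, not from this repo):

import * as iam from 'aws-cdk-lib/aws-iam';

// Required only if the caller itself launches instances with a profile.
const passRoleForLaunch = new iam.PolicyStatement({
  actions: ['iam:PassRole'],
  resources: ['arn:aws:iam::111122223333:role/ExampleInstanceRole'], // placeholder ARN
  conditions: {
    StringEquals: { 'iam:PassedToService': 'ec2.amazonaws.com' },
  },
});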

cdk/src/handlers/orchestrate-task.ts

Lines changed: 27 additions & 28 deletions
@@ -41,8 +41,8 @@ interface OrchestrateTaskEvent {

 const MAX_POLL_ATTEMPTS = 1020; // ~8.5h at 30s intervals
 const MAX_NON_RUNNING_POLLS = 10; // ~5min grace period for session to start
-const MAX_CONSECUTIVE_ECS_POLL_FAILURES = 3;
-const MAX_CONSECUTIVE_ECS_COMPLETED_POLLS = 5;
+const MAX_CONSECUTIVE_COMPUTE_POLL_FAILURES = 3;
+const MAX_CONSECUTIVE_COMPUTE_COMPLETED_POLLS = 5;

 const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = async (event, context) => {
   const { task_id: taskId } = event;
@@ -176,63 +176,62 @@ const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = asyn
     'await-agent-completion',
     async (state) => {
       const ddbState = await pollTaskStatus(taskId, state);
-      let consecutiveEcsPollFailures = 0;
-      let consecutiveEcsCompletedPolls = 0;
+      let consecutiveComputePollFailures = 0;
+      let consecutiveComputeCompletedPolls = 0;
+      const computeLabel = blueprintConfig.compute_type.toUpperCase();

-      // ECS compute-level crash detection: if DDB is not terminal, check ECS task status
+      // Compute-level crash detection: if DDB is not terminal, check compute task status
       if (
         ddbState.lastStatus &&
         !TERMINAL_STATUSES.includes(ddbState.lastStatus) &&
         computeStrategy
       ) {
         try {
-          const ecsStatus = await computeStrategy.pollSession(sessionHandle);
-          if (ecsStatus.status === 'failed') {
-            const errorMsg = 'error' in ecsStatus ? ecsStatus.error : 'ECS task failed';
-            logger.warn('ECS task failed before DDB terminal write', {
+          const computeStatus = await computeStrategy.pollSession(sessionHandle);
+          if (computeStatus.status === 'failed') {
+            const errorMsg = 'error' in computeStatus ? computeStatus.error : `${computeLabel} task failed`;
+            logger.warn(`${computeLabel} task failed before DDB terminal write`, {
               task_id: taskId,
               error: errorMsg,
             });
-            await failTask(taskId, ddbState.lastStatus, `ECS container failed: ${errorMsg}`, task.user_id, true);
+            await failTask(taskId, ddbState.lastStatus, `${computeLabel} compute failed: ${errorMsg}`, task.user_id, true);
             return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED };
           }
-          if (ecsStatus.status === 'completed') {
-            consecutiveEcsCompletedPolls = (state.consecutiveEcsCompletedPolls ?? 0) + 1;
-            if (consecutiveEcsCompletedPolls >= MAX_CONSECUTIVE_ECS_COMPLETED_POLLS) {
-              // ECS task exited successfully but DDB never reached terminal — the agent
-              // likely crashed after container exit code 0 but before writing status.
-              logger.error('ECS task completed but DDB never caught up — failing task', {
+          if (computeStatus.status === 'completed') {
+            consecutiveComputeCompletedPolls = (state.consecutiveComputeCompletedPolls ?? 0) + 1;
+            if (consecutiveComputeCompletedPolls >= MAX_CONSECUTIVE_COMPUTE_COMPLETED_POLLS) {
+              logger.error(`${computeLabel} task completed but DDB never caught up — failing task`, {
                 task_id: taskId,
-                consecutive_completed_polls: consecutiveEcsCompletedPolls,
+                consecutive_completed_polls: consecutiveComputeCompletedPolls,
               });
-              await failTask(taskId, ddbState.lastStatus, `ECS task exited successfully but agent never wrote terminal status after ${consecutiveEcsCompletedPolls} polls`, task.user_id, true);
+              await failTask(taskId, ddbState.lastStatus, `${computeLabel} task exited successfully but agent never wrote terminal status after ${consecutiveComputeCompletedPolls} polls`, task.user_id, true);
               return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED };
             }
-            logger.warn('ECS task completed but DDB not terminal — waiting for DDB catchup', {
+            logger.warn(`${computeLabel} task completed but DDB not terminal — waiting for DDB catchup`, {
               task_id: taskId,
-              consecutive_completed_polls: consecutiveEcsCompletedPolls,
+              consecutive_completed_polls: consecutiveComputeCompletedPolls,
             });
           }
         } catch (err) {
-          consecutiveEcsPollFailures = (state.consecutiveEcsPollFailures ?? 0) + 1;
-          if (consecutiveEcsPollFailures >= MAX_CONSECUTIVE_ECS_POLL_FAILURES) {
-            logger.error('ECS pollSession failed repeatedly — failing task', {
+          consecutiveComputePollFailures = (state.consecutiveComputePollFailures ?? 0) + 1;
+          if (consecutiveComputePollFailures >= MAX_CONSECUTIVE_COMPUTE_POLL_FAILURES) {
+            logger.error(`${computeLabel} pollSession failed repeatedly — failing task`, {
               task_id: taskId,
-              consecutive_failures: consecutiveEcsPollFailures,
+              consecutive_failures: consecutiveComputePollFailures,
               error: err instanceof Error ? err.message : String(err),
             });
-            await failTask(taskId, ddbState.lastStatus, `ECS poll failed ${consecutiveEcsPollFailures} consecutive times: ${err instanceof Error ? err.message : String(err)}`, task.user_id, true);
+            await failTask(taskId, ddbState.lastStatus, `${computeLabel} poll failed ${consecutiveComputePollFailures} consecutive times: ${err instanceof Error ? err.message : String(err)}`, task.user_id, true);
             return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED };
           }
-          logger.warn('ECS pollSession check failed (non-fatal)', {
+          logger.warn(`${computeLabel} pollSession check failed (non-fatal)`, {
             task_id: taskId,
-            consecutive_failures: consecutiveEcsPollFailures,
+            consecutive_failures: consecutiveComputePollFailures,
             error: err instanceof Error ? err.message : String(err),
           });
         }
       }

-      return { ...ddbState, consecutiveEcsPollFailures, consecutiveEcsCompletedPolls };
+      return { ...ddbState, consecutiveComputePollFailures, consecutiveComputeCompletedPolls };
     },
     {
       initialState: { attempts: 0 },
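The renamed counters follow a simple escalation pattern: each counter is read back out of the durable poll state, incremented, and persisted by being returned, so the threshold survives handler restarts between polls. A self-contained sketch of that shape; the types and names below are illustrative, not the handler's real ones:

interface DemoPollState {
  readonly consecutiveComputePollFailures?: number;
}

const MAX_FAILURES = 3;

function recordPollFailure(state: DemoPollState): { escalate: boolean; next: DemoPollState } {
  // Read the persisted counter, increment it, and decide whether to escalate.
  const failures = (state.consecutiveComputePollFailures ?? 0) + 1;
  return { escalate: failures >= MAX_FAILURES, next: { consecutiveComputePollFailures: failures } };
}

// Three consecutive failed polls trip the threshold; a successful poll would
// reset it by returning a fresh state object without the counter.
let state: DemoPollState = {};
for (let i = 0; i < 3; i++) {
  const { escalate, next } = recordPollFailure(state);
  state = next;
  console.log(i + 1, escalate ? 'escalate: fail the task' : 'warn and continue');
}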

cdk/src/handlers/shared/orchestrator.ts

Lines changed: 4 additions & 4 deletions
@@ -47,10 +47,10 @@ export interface PollState {
   readonly lastStatus?: TaskStatusType;
   /** True when the agent stopped sending heartbeats while still RUNNING (likely crash/OOM). */
   readonly sessionUnhealthy?: boolean;
-  /** Consecutive ECS poll failures — escalated to error after 3. */
-  readonly consecutiveEcsPollFailures?: number;
-  /** Consecutive polls where ECS reports completed but DDB is not terminal — escalated after 5. */
-  readonly consecutiveEcsCompletedPolls?: number;
+  /** Consecutive compute poll failures — escalated to error after 3. */
+  readonly consecutiveComputePollFailures?: number;
+  /** Consecutive polls where compute reports completed but DDB is not terminal — escalated after 5. */
+  readonly consecutiveComputeCompletedPolls?: number;
 }

 /** After RUNNING this long, we expect `agent_heartbeat_at` from the agent (if ever set). */

cdk/src/handlers/shared/strategies/ec2-strategy.ts

Lines changed: 54 additions & 48 deletions
@@ -105,69 +105,75 @@ export class Ec2ComputeStrategy implements ComputeStrategy {
       ],
     }));

-    // 4. Build the boot command (mirrors ECS strategy env vars and Python boot command)
-    const envExports = [
-      `export TASK_ID='${taskId}'`,
-      `export REPO_URL='${String(payload.repo_url ?? '')}'`,
-      ...(payload.prompt ? [`export TASK_DESCRIPTION='${String(payload.prompt).replace(/'/g, "'\\''")}'`] : []),
-      ...(payload.issue_number ? [`export ISSUE_NUMBER='${String(payload.issue_number)}'`] : []),
-      `export MAX_TURNS='${String(payload.max_turns ?? 100)}'`,
-      ...(payload.max_budget_usd !== undefined ? [`export MAX_BUDGET_USD='${String(payload.max_budget_usd)}'`] : []),
-      ...(blueprintConfig.model_id ? [`export ANTHROPIC_MODEL='${blueprintConfig.model_id}'`] : []),
-      ...(blueprintConfig.system_prompt_overrides ? [`export SYSTEM_PROMPT_OVERRIDES='${blueprintConfig.system_prompt_overrides.replace(/'/g, "'\\''")}'`] : []),
-      "export CLAUDE_CODE_USE_BEDROCK='1'",
-      ...(payload.github_token_secret_arn ? [`export GITHUB_TOKEN_SECRET_ARN='${String(payload.github_token_secret_arn)}'`] : []),
-      ...(payload.memory_id ? [`export MEMORY_ID='${String(payload.memory_id)}'`] : []),
-    ];
-
+    // 4. Build the boot script
+    // All task data is read from the S3 payload at runtime to avoid shell
+    // injection — no untrusted values are interpolated into the script.
+    // Only infrastructure constants (bucket name, ECR URI) are embedded.
     const bootScript = [
       '#!/bin/bash',
       'set -euo pipefail',
       '',
+      '# Derive region from IMDS (SSM does not always set AWS_REGION)',
+      'export AWS_REGION=$(ec2-metadata --availability-zone | cut -d" " -f2 | sed \'s/.$//\')',
+      'export AWS_DEFAULT_REGION="$AWS_REGION"',
+      '',
+      '# Resolve instance ID for tag cleanup',
+      'INSTANCE_ID=$(ec2-metadata -i | cut -d" " -f2)',
+      '',
+      '# Cleanup trap — always retag instance as idle on exit (success, error, or signal)',
+      'cleanup() {',
+      '  docker system prune -f || true',
+      '  rm -f /tmp/payload.json',
+      `  aws ec2 create-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:status,Value=idle || true`,
+      `  aws ec2 delete-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:task-id || true`,
+      '}',
+      'trap cleanup EXIT',
+      '',
       '# Fetch payload from S3',
       `aws s3 cp "s3://${EC2_PAYLOAD_BUCKET}/${payloadKey}" /tmp/payload.json`,
       'export AGENT_PAYLOAD=$(cat /tmp/payload.json)',
-      '',
-      '# Set environment variables',
-      ...envExports,
+      'export CLAUDE_CODE_USE_BEDROCK=1',
       '',
       '# ECR login and pull',
-      `aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $(echo '${ECR_IMAGE_URI}' | cut -d/ -f1)`,
+      `aws ecr get-login-password --region "$AWS_REGION" | docker login --username AWS --password-stdin $(echo '${ECR_IMAGE_URI}' | cut -d/ -f1)`,
       `docker pull '${ECR_IMAGE_URI}'`,
       '',
-      '# Run the agent container',
-      'docker run --rm \\',
-      '  -e TASK_ID -e REPO_URL -e CLAUDE_CODE_USE_BEDROCK -e AGENT_PAYLOAD \\',
-      '  -e AWS_REGION -e AWS_DEFAULT_REGION \\',
-      `  ${payload.prompt ? '-e TASK_DESCRIPTION ' : ''}${payload.issue_number ? '-e ISSUE_NUMBER ' : ''}-e MAX_TURNS \\`,
-      `  ${payload.max_budget_usd !== undefined ? '-e MAX_BUDGET_USD ' : ''}${blueprintConfig.model_id ? '-e ANTHROPIC_MODEL ' : ''}${blueprintConfig.system_prompt_overrides ? '-e SYSTEM_PROMPT_OVERRIDES ' : ''}\\`,
-      `  ${payload.github_token_secret_arn ? '-e GITHUB_TOKEN_SECRET_ARN ' : ''}${payload.memory_id ? '-e MEMORY_ID ' : ''}\\`,
-      `  '${ECR_IMAGE_URI}' \\`,
+      '# Run the agent container — all config is read from AGENT_PAYLOAD inside the container',
+      `docker run --rm -e AGENT_PAYLOAD -e CLAUDE_CODE_USE_BEDROCK -e AWS_REGION -e AWS_DEFAULT_REGION '${ECR_IMAGE_URI}' \\`,
       '  python -c \'import json, os, sys; sys.path.insert(0, "/app"); from entrypoint import run_task; p = json.loads(os.environ["AGENT_PAYLOAD"]); r = run_task(repo_url=p.get("repo_url",""), task_description=p.get("prompt",""), issue_number=str(p.get("issue_number","")), github_token=p.get("github_token",""), anthropic_model=p.get("model_id",""), max_turns=int(p.get("max_turns",100)), max_budget_usd=p.get("max_budget_usd"), aws_region=os.environ.get("AWS_REGION",""), task_id=p.get("task_id",""), hydrated_context=p.get("hydrated_context"), system_prompt_overrides=p.get("system_prompt_overrides",""), prompt_version=p.get("prompt_version",""), memory_id=p.get("memory_id",""), task_type=p.get("task_type","new_task"), branch_name=p.get("branch_name",""), pr_number=str(p.get("pr_number",""))); sys.exit(0 if r.get("status")=="success" else 1)\'',
-      '',
-      '# Cleanup',
-      'docker system prune -f',
-      'rm -f /tmp/payload.json',
-      '',
-      '# Tag instance back to idle',
-      'INSTANCE_ID=$(ec2-metadata -i | cut -d" " -f2)',
-      'aws ec2 create-tags --resources "$INSTANCE_ID" --tags Key=bgagent:status,Value=idle',
-      'aws ec2 delete-tags --resources "$INSTANCE_ID" --tags Key=bgagent:task-id',
     ].join('\n');

-    // 5. Send SSM Run Command
-    const ssmResult = await getSsmClient().send(new SendCommandCommand({
-      DocumentName: 'AWS-RunShellScript',
-      InstanceIds: [instanceId],
-      Parameters: {
-        commands: [bootScript],
-      },
-      TimeoutSeconds: 32400, // 9 hours, matches orchestrator max
-    }));
+    // 5. Send SSM Run Command — rollback instance tags on failure
+    let commandId: string;
+    try {
+      const ssmResult = await getSsmClient().send(new SendCommandCommand({
+        DocumentName: 'AWS-RunShellScript',
+        InstanceIds: [instanceId],
+        Parameters: {
+          commands: [bootScript],
+        },
+        TimeoutSeconds: 32400, // 9 hours, matches orchestrator max
+      }));

-    const commandId = ssmResult.Command?.CommandId;
-    if (!commandId) {
-      throw new Error('SSM SendCommand returned no CommandId');
+      if (!ssmResult.Command?.CommandId) {
+        throw new Error('SSM SendCommand returned no CommandId');
+      }
+      commandId = ssmResult.Command.CommandId;
+    } catch (err) {
+      // Rollback: retag instance as idle so it's not stuck as busy
+      try {
+        await getEc2Client().send(new CreateTagsCommand({
+          Resources: [instanceId],
+          Tags: [{ Key: 'bgagent:status', Value: 'idle' }],
+        }));
+        await getEc2Client().send(new DeleteTagsCommand({
+          Resources: [instanceId],
+          Tags: [{ Key: 'bgagent:task-id' }],
+        }));
+      } catch {
+        logger.warn('Failed to rollback instance tags after dispatch failure', { instance_id: instanceId, task_id: taskId });
+      }
+      throw err;
     }

     logger.info('EC2 SSM command dispatched', {
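Two independent safety nets now guard the idle tag: the bash trap covers every exit path once the script is running, and the TypeScript rollback covers dispatch failures before it ever starts. The rollback reduces to the following shape; claim, release, and dispatch are hypothetical stand-ins for the tagging and SSM calls:

// Claim a resource, attempt dispatch, and undo the claim if dispatch throws.
async function dispatchWithRollback(
  instanceId: string,
  claim: (id: string) => Promise<void>,
  release: (id: string) => Promise<void>,
  dispatch: (id: string) => Promise<string>,
): Promise<string> {
  await claim(instanceId); // e.g. tag bgagent:status=busy plus bgagent:task-id
  try {
    return await dispatch(instanceId); // e.g. SSM SendCommand
  } catch (err) {
    try {
      await release(instanceId); // retag idle, drop task-id
    } catch {
      // Best-effort rollback: swallow so the original dispatch error propagates.
    }
    throw err;
  }
}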

cdk/src/stacks/agent.ts

Lines changed: 1 addition & 2 deletions
@@ -339,7 +339,6 @@ export class AgentStack extends Stack {
       // fleetTagValue: ec2Fleet.fleetTagValue,
       // payloadBucketName: ec2Fleet.payloadBucket.bucketName,
       // ecrImageUri: agentImageAsset.imageUri,
-      // instanceRoleArn: ec2Fleet.instanceRole.roleArn,
       // },
     });

@@ -366,7 +365,7 @@
       // To allow cancel-task to stop ECS-backed tasks, uncomment:
       // ecsClusterArn: ecsCluster.cluster.clusterArn,
       // To allow cancel-task to stop EC2-backed tasks, uncomment:
-      // ec2FleetConfig: { instanceRoleArn: ec2Fleet.instanceRole.roleArn },
+      // ec2FleetConfig: {},
     });

     // --- Operator dashboard ---
