Skip to content

Commit 51882aa

Browse files
feat(compute): implement ECS Fargate backend via ComputeStrategy pattern
Wire ECS Fargate as a compute backend behind the existing ComputeStrategy interface, using the existing durable Lambda orchestrator. No separate stacks or Step Functions — ECS is a strategy option alongside AgentCore. Changes: - EcsComputeStrategy: startSession (RunTask), pollSession (DescribeTasks state mapping), stopSession (StopTask with graceful error handling) - EcsAgentCluster construct: ECS Cluster (container insights), Fargate task def (2 vCPU/4GB/ARM64), security group (TCP 443 egress only), CloudWatch log group, task role (DynamoDB, SecretsManager, Bedrock) - TaskOrchestrator: optional ECS props for env vars and IAM policies (ecs:RunTask/DescribeTasks/StopTask conditioned on cluster ARN, iam:PassRole conditioned on ecs-tasks.amazonaws.com) - Orchestrator polling: ECS compute-level crash detection alongside existing DDB polling (non-fatal, wrapped in try/catch) - AgentStack: conditional ECS infrastructure (ABCA_ENABLE_ECS env var) - Full test coverage: 15 ECS strategy tests, 9 construct tests, 5 orchestrator ECS tests. All 563 tests pass. Deployed and verified: stack deploys cleanly, CDK synth passes cdk-nag, agent task running on AgentCore path unaffected.
1 parent 4b004a3 commit 51882aa

12 files changed

Lines changed: 1720 additions & 19 deletions

File tree

cdk/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"@aws-cdk/mixins-preview": "2.238.0-alpha.0",
1919
"@aws-sdk/client-bedrock-agentcore": "^3.1021.0",
2020
"@aws-sdk/client-bedrock-runtime": "^3.1021.0",
21+
"@aws-sdk/client-ecs": "^3.1021.0",
2122
"@aws-sdk/client-dynamodb": "^3.1021.0",
2223
"@aws-sdk/client-lambda": "^3.1021.0",
2324
"@aws-sdk/client-secrets-manager": "^3.1021.0",
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
/**
2+
* MIT No Attribution
3+
*
4+
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy of
7+
* the Software without restriction, including without limitation the rights to
8+
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9+
* the Software, and to permit persons to whom the Software is furnished to do so.
10+
*
11+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17+
* SOFTWARE.
18+
*/
19+
20+
import { RemovalPolicy } from 'aws-cdk-lib';
21+
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
22+
import * as ec2 from 'aws-cdk-lib/aws-ec2';
23+
import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets';
24+
import * as ecs from 'aws-cdk-lib/aws-ecs';
25+
import * as iam from 'aws-cdk-lib/aws-iam';
26+
import * as logs from 'aws-cdk-lib/aws-logs';
27+
import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
28+
import { NagSuppressions } from 'cdk-nag';
29+
import { Construct } from 'constructs';
30+
31+
export interface EcsAgentClusterProps {
32+
readonly vpc: ec2.IVpc;
33+
readonly agentImageAsset: ecr_assets.DockerImageAsset;
34+
readonly taskTable: dynamodb.ITable;
35+
readonly taskEventsTable: dynamodb.ITable;
36+
readonly userConcurrencyTable: dynamodb.ITable;
37+
readonly githubTokenSecret: secretsmanager.ISecret;
38+
readonly memoryId?: string;
39+
}
40+
41+
export class EcsAgentCluster extends Construct {
42+
public readonly cluster: ecs.Cluster;
43+
public readonly taskDefinition: ecs.FargateTaskDefinition;
44+
public readonly securityGroup: ec2.SecurityGroup;
45+
public readonly containerName: string;
46+
47+
constructor(scope: Construct, id: string, props: EcsAgentClusterProps) {
48+
super(scope, id);
49+
50+
this.containerName = 'AgentContainer';
51+
52+
// ECS Cluster with Fargate capacity provider and container insights
53+
this.cluster = new ecs.Cluster(this, 'Cluster', {
54+
vpc: props.vpc,
55+
containerInsights: true,
56+
});
57+
58+
// Security group — egress TCP 443 only
59+
this.securityGroup = new ec2.SecurityGroup(this, 'TaskSG', {
60+
vpc: props.vpc,
61+
description: 'ECS Agent Tasks - egress TCP 443 only',
62+
allowAllOutbound: false,
63+
});
64+
65+
this.securityGroup.addEgressRule(
66+
ec2.Peer.anyIpv4(),
67+
ec2.Port.tcp(443),
68+
'Allow HTTPS egress (GitHub API, AWS services)',
69+
);
70+
71+
// CloudWatch log group for agent task output
72+
const logGroup = new logs.LogGroup(this, 'TaskLogGroup', {
73+
logGroupName: '/ecs/abca-agent-tasks',
74+
retention: logs.RetentionDays.THREE_MONTHS,
75+
removalPolicy: RemovalPolicy.DESTROY,
76+
});
77+
78+
// Task execution role (used by ECS agent to pull images, write logs)
79+
// CDK creates this automatically via taskDefinition, but we need to
80+
// grant additional permissions to the task role.
81+
82+
// Fargate task definition
83+
this.taskDefinition = new ecs.FargateTaskDefinition(this, 'TaskDef', {
84+
cpu: 2048,
85+
memoryLimitMiB: 4096,
86+
runtimePlatform: {
87+
cpuArchitecture: ecs.CpuArchitecture.ARM64,
88+
operatingSystemFamily: ecs.OperatingSystemFamily.LINUX,
89+
},
90+
});
91+
92+
// Container
93+
this.taskDefinition.addContainer(this.containerName, {
94+
image: ecs.ContainerImage.fromDockerImageAsset(props.agentImageAsset),
95+
logging: ecs.LogDrivers.awsLogs({
96+
logGroup,
97+
streamPrefix: 'agent',
98+
}),
99+
environment: {
100+
CLAUDE_CODE_USE_BEDROCK: '1',
101+
TASK_TABLE_NAME: props.taskTable.tableName,
102+
TASK_EVENTS_TABLE_NAME: props.taskEventsTable.tableName,
103+
USER_CONCURRENCY_TABLE_NAME: props.userConcurrencyTable.tableName,
104+
LOG_GROUP_NAME: logGroup.logGroupName,
105+
...(props.memoryId && { MEMORY_ID: props.memoryId }),
106+
},
107+
});
108+
109+
// Task role permissions
110+
const taskRole = this.taskDefinition.taskRole;
111+
112+
// DynamoDB read/write on task tables
113+
props.taskTable.grantReadWriteData(taskRole);
114+
props.taskEventsTable.grantReadWriteData(taskRole);
115+
props.userConcurrencyTable.grantReadWriteData(taskRole);
116+
117+
// Secrets Manager read for GitHub token
118+
props.githubTokenSecret.grantRead(taskRole);
119+
120+
// Bedrock model invocation
121+
taskRole.addToPrincipalPolicy(new iam.PolicyStatement({
122+
actions: [
123+
'bedrock:InvokeModel',
124+
'bedrock:InvokeModelWithResponseStream',
125+
],
126+
resources: ['*'],
127+
}));
128+
129+
// CloudWatch Logs write
130+
logGroup.grantWrite(taskRole);
131+
132+
NagSuppressions.addResourceSuppressions(this.taskDefinition, [
133+
{
134+
id: 'AwsSolutions-IAM5',
135+
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; Bedrock InvokeModel requires * resource; Secrets Manager wildcards from CDK grantRead; CloudWatch Logs wildcards from CDK grantWrite',
136+
},
137+
{
138+
id: 'AwsSolutions-ECS2',
139+
reason: 'Environment variables contain table names and configuration, not secrets — GitHub token is fetched from Secrets Manager at runtime',
140+
},
141+
], true);
142+
143+
NagSuppressions.addResourceSuppressions(this.cluster, [
144+
{
145+
id: 'AwsSolutions-ECS4',
146+
reason: 'Container insights is enabled via the containerInsights prop',
147+
},
148+
], true);
149+
}
150+
}

cdk/src/constructs/task-orchestrator.ts

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,32 @@ export interface TaskOrchestratorProps {
100100
* and writes fallback episodes during finalization.
101101
*/
102102
readonly memoryId?: string;
103+
104+
/**
105+
* ARN of the ECS cluster for ECS compute strategy.
106+
* When provided, ECS-related env vars and IAM policies are added.
107+
*/
108+
readonly ecsClusterArn?: string;
109+
110+
/**
111+
* ARN of the ECS task definition for ECS compute strategy.
112+
*/
113+
readonly ecsTaskDefinitionArn?: string;
114+
115+
/**
116+
* Comma-separated subnet IDs for ECS tasks.
117+
*/
118+
readonly ecsSubnets?: string;
119+
120+
/**
121+
* Security group ID for ECS tasks.
122+
*/
123+
readonly ecsSecurityGroup?: string;
124+
125+
/**
126+
* Container name in the ECS task definition.
127+
*/
128+
readonly ecsContainerName?: string;
103129
}
104130

105131
/**
@@ -152,6 +178,11 @@ export class TaskOrchestrator extends Construct {
152178
USER_PROMPT_TOKEN_BUDGET: String(props.userPromptTokenBudget),
153179
}),
154180
...(props.memoryId && { MEMORY_ID: props.memoryId }),
181+
...(props.ecsClusterArn && { ECS_CLUSTER_ARN: props.ecsClusterArn }),
182+
...(props.ecsTaskDefinitionArn && { ECS_TASK_DEFINITION_ARN: props.ecsTaskDefinitionArn }),
183+
...(props.ecsSubnets && { ECS_SUBNETS: props.ecsSubnets }),
184+
...(props.ecsSecurityGroup && { ECS_SECURITY_GROUP: props.ecsSecurityGroup }),
185+
...(props.ecsContainerName && { ECS_CONTAINER_NAME: props.ecsContainerName }),
155186
},
156187
bundling: {
157188
externalModules: ['@aws-sdk/*'],
@@ -192,6 +223,33 @@ export class TaskOrchestrator extends Construct {
192223
resources: runtimeResources,
193224
}));
194225

226+
// ECS compute strategy permissions (only when ECS is configured)
227+
if (props.ecsClusterArn) {
228+
this.fn.addToRolePolicy(new iam.PolicyStatement({
229+
actions: [
230+
'ecs:RunTask',
231+
'ecs:DescribeTasks',
232+
'ecs:StopTask',
233+
],
234+
resources: ['*'],
235+
conditions: {
236+
ArnEquals: {
237+
'ecs:cluster': props.ecsClusterArn,
238+
},
239+
},
240+
}));
241+
242+
this.fn.addToRolePolicy(new iam.PolicyStatement({
243+
actions: ['iam:PassRole'],
244+
resources: ['*'],
245+
conditions: {
246+
StringEquals: {
247+
'iam:PassedToService': 'ecs-tasks.amazonaws.com',
248+
},
249+
},
250+
}));
251+
}
252+
195253
// Per-repo Secrets Manager grants (e.g. per-repo GitHub tokens from Blueprints)
196254
for (const [index, secretArn] of (props.additionalSecretArns ?? []).entries()) {
197255
const secret = secretsmanager.Secret.fromSecretCompleteArn(
@@ -229,7 +287,7 @@ export class TaskOrchestrator extends Construct {
229287
},
230288
{
231289
id: 'AwsSolutions-IAM5',
232-
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; AgentCore runtime/* required for sub-resource invocation; Secrets Manager wildcards generated by CDK grantRead; AgentCore Memory wildcards generated by CDK grantRead/grantWrite',
290+
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; AgentCore runtime/* required for sub-resource invocation; Secrets Manager wildcards generated by CDK grantRead; AgentCore Memory wildcards generated by CDK grantRead/grantWrite; ECS RunTask/DescribeTasks/StopTask conditioned on cluster ARN; iam:PassRole conditioned on ecs-tasks.amazonaws.com',
233291
},
234292
], true);
235293
}

cdk/src/handlers/orchestrate-task.ts

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = asyn
119119
});
120120

121121
// Step 4: Start agent session — resolve compute strategy, invoke runtime, transition to RUNNING
122-
await context.step('start-session', async () => {
122+
// Returns the full SessionHandle (serializable) so ECS polling can use it in step 5.
123+
const sessionHandle = await context.step('start-session', async () => {
123124
try {
124125
const strategy = resolveComputeStrategy(blueprintConfig);
125126
const handle = await strategy.startSession({ taskId, payload, blueprintConfig });
@@ -139,24 +140,49 @@ const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = asyn
139140
strategy_type: handle.strategyType,
140141
});
141142

142-
return handle.sessionId;
143+
return handle;
143144
} catch (err) {
144145
await failTask(taskId, TaskStatus.HYDRATING, `Session start failed: ${String(err)}`, task.user_id, true);
145146
throw err;
146147
}
147148
});
148149

149150
// Step 5: Wait for agent to finish
150-
// NOTE: Polls DynamoDB every 30s rather than re-invoking the AgentCore session.
151-
// The agent writes terminal status directly to DDB. If the agent crashes without
152-
// writing a terminal status, we detect it via the HYDRATING early-exit check
153-
// (MAX_NON_RUNNING_POLLS ~5min); otherwise the loop runs up to MAX_POLL_ATTEMPTS
154-
// (~8.5h). A future improvement could add AgentCore session status checks for
155-
// faster crash detection.
151+
// Polls DynamoDB every 30s. For ECS compute, also polls the ECS task to detect
152+
// container crashes that don't write terminal status to DDB. For AgentCore,
153+
// crash detection relies on the HYDRATING early-exit (MAX_NON_RUNNING_POLLS ~5min).
156154
const finalPollState = await context.waitForCondition<PollState>(
157155
'await-agent-completion',
158156
async (state) => {
159-
return pollTaskStatus(taskId, state);
157+
const ddbState = await pollTaskStatus(taskId, state);
158+
159+
// ECS compute-level crash detection: if DDB is not terminal, check ECS task status
160+
if (
161+
ddbState.lastStatus &&
162+
!TERMINAL_STATUSES.includes(ddbState.lastStatus) &&
163+
blueprintConfig.compute_type === 'ecs'
164+
) {
165+
try {
166+
const strategy = resolveComputeStrategy(blueprintConfig);
167+
const ecsStatus = await strategy.pollSession(sessionHandle);
168+
if (ecsStatus.status === 'failed') {
169+
const errorMsg = 'error' in ecsStatus ? ecsStatus.error : 'ECS task failed';
170+
logger.warn('ECS task failed before DDB terminal write', {
171+
task_id: taskId,
172+
error: errorMsg,
173+
});
174+
await failTask(taskId, ddbState.lastStatus, `ECS container failed: ${errorMsg}`, task.user_id, true);
175+
return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED };
176+
}
177+
} catch (err) {
178+
logger.warn('ECS pollSession check failed (non-fatal)', {
179+
task_id: taskId,
180+
error: err instanceof Error ? err.message : String(err),
181+
});
182+
}
183+
}
184+
185+
return ddbState;
160186
},
161187
{
162188
initialState: { attempts: 0 },

cdk/src/handlers/shared/compute-strategy.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import type { BlueprintConfig, ComputeType } from './repo-config';
2121
import { AgentCoreComputeStrategy } from './strategies/agentcore-strategy';
22+
import { EcsComputeStrategy } from './strategies/ecs-strategy';
2223

2324
export interface SessionHandle {
2425
readonly sessionId: string;
@@ -48,7 +49,7 @@ export function resolveComputeStrategy(blueprintConfig: BlueprintConfig): Comput
4849
case 'agentcore':
4950
return new AgentCoreComputeStrategy();
5051
case 'ecs':
51-
throw new Error("compute_type 'ecs' is not yet implemented");
52+
return new EcsComputeStrategy();
5253
default: {
5354
const _exhaustive: never = computeType;
5455
throw new Error(`Unknown compute_type: '${_exhaustive}'`);

0 commit comments

Comments
 (0)