Skip to content

Commit bdb89f4

Browse files
feat(compute): add EC2 fleet compute strategy with SSM dispatch
Add a third compute backend (EC2 fleet with SSM Run Command) alongside the existing AgentCore and ECS strategies. This provides maximum flexibility with no image size limits, configurable instance types (including GPU), and full control over the compute environment.

New files:
- ec2-strategy.ts: ComputeStrategy implementation using EC2 tags for instance tracking and SSM RunShellScript for task dispatch
- ec2-agent-fleet.ts: CDK construct with ASG, launch template, security group, S3 payload bucket, and IAM role
- ec2-strategy.test.ts and ec2-agent-fleet.test.ts: full test coverage

Wiring:
- repo-config.ts: add 'ec2' to ComputeType, add instance_type field
- compute-strategy.ts: add EC2 SessionHandle variant and resolver case
- task-orchestrator.ts: add ec2Config prop with env vars and IAM grants
- orchestrate-task.ts: enable compute polling for EC2
- cancel-task.ts: add SSM CancelCommand for EC2 tasks
- task-api.ts: add ssm:CancelCommand permission for cancel Lambda
- agent.ts: add commented-out EC2 fleet block (same pattern as ECS)
1 parent a53282d commit bdb89f4

16 files changed

Lines changed: 1604 additions & 7 deletions

cdk/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,13 @@
1818
"@aws-cdk/mixins-preview": "2.238.0-alpha.0",
1919
"@aws-sdk/client-bedrock-agentcore": "^3.1021.0",
2020
"@aws-sdk/client-bedrock-runtime": "^3.1021.0",
21+
"@aws-sdk/client-ec2": "^3.1021.0",
2122
"@aws-sdk/client-ecs": "^3.1021.0",
2223
"@aws-sdk/client-dynamodb": "^3.1021.0",
2324
"@aws-sdk/client-lambda": "^3.1021.0",
25+
"@aws-sdk/client-s3": "^3.1021.0",
2426
"@aws-sdk/client-secrets-manager": "^3.1021.0",
27+
"@aws-sdk/client-ssm": "^3.1021.0",
2528
"@aws-sdk/lib-dynamodb": "^3.1021.0",
2629
"@aws/durable-execution-sdk-js": "^1.1.0",
2730
"aws-cdk-lib": "^2.238.0",

cdk/src/constructs/blueprint.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ export interface BlueprintProps {
4747
* Compute strategy type.
4848
* @default 'agentcore'
4949
*/
50-
readonly type?: 'agentcore' | 'ecs';
50+
readonly type?: 'agentcore' | 'ecs' | 'ec2';
5151

5252
/**
5353
* Override the default runtime ARN (agentcore strategy).
Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
/**
2+
* MIT No Attribution
3+
*
4+
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy of
7+
* the Software without restriction, including without limitation the rights to
8+
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9+
* the Software, and to permit persons to whom the Software is furnished to do so.
10+
*
11+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17+
* SOFTWARE.
18+
*/
19+
20+
import { Duration, RemovalPolicy, Tags } from 'aws-cdk-lib';
21+
import * as autoscaling from 'aws-cdk-lib/aws-autoscaling';
22+
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
23+
import * as ec2 from 'aws-cdk-lib/aws-ec2';
24+
import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets';
25+
import * as iam from 'aws-cdk-lib/aws-iam';
26+
import * as logs from 'aws-cdk-lib/aws-logs';
27+
import * as s3 from 'aws-cdk-lib/aws-s3';
28+
import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
29+
import { NagSuppressions } from 'cdk-nag';
30+
import { Construct } from 'constructs';
31+
32+
/**
 * Configuration for the {@link Ec2AgentFleet} construct.
 */
export interface Ec2AgentFleetProps {
  /**
   * VPC that hosts the fleet. The security group is created in it and the
   * Auto Scaling Group is placed in its `PRIVATE_WITH_EGRESS` subnets.
   */
  readonly vpc: ec2.IVpc;

  /**
   * Agent container image. Its URI is pre-pulled by the instance user data and
   * its ECR repository ARN scopes the instance role's image-pull permissions.
   */
  readonly agentImageAsset: ecr_assets.DockerImageAsset;

  /** Secret holding the GitHub token; the instance role is granted read access. */
  readonly githubTokenSecret: secretsmanager.ISecret;

  /** Task table; the instance role is granted read/write data access. */
  readonly taskTable: dynamodb.ITable;

  /** Task events table; the instance role is granted read/write data access. */
  readonly taskEventsTable: dynamodb.ITable;

  /** Per-user concurrency table; the instance role is granted read/write data access. */
  readonly userConcurrencyTable: dynamodb.ITable;

  /**
   * EC2 instance type for fleet instances.
   * @default m7g.xlarge
   */
  readonly instanceType?: ec2.InstanceType;

  /**
   * Desired number of instances. The same value is also used as the group's
   * minimum capacity.
   * @default 1
   */
  readonly desiredCapacity?: number;

  /**
   * Maximum number of instances in the Auto Scaling Group.
   * @default 3
   */
  readonly maxCapacity?: number;

  /**
   * Memory identifier.
   * NOTE(review): not referenced by the Ec2AgentFleet constructor in this
   * file — confirm whether it is meant to be passed through to instances.
   */
  readonly memoryId?: string;
}
44+
45+
/**
 * Fleet of EC2 instances that run the agent container.
 *
 * Creates, in one construct:
 * - a security group that only allows outbound TCP 443,
 * - an S3 bucket for payload overflow (objects expire after 7 days),
 * - a CloudWatch log group,
 * - an instance role (SSM core policy, DynamoDB, Secrets Manager, Bedrock,
 *   CloudWatch Logs, ECR pull, S3 read, conditional EC2 tagging),
 * - an Auto Scaling Group whose user data installs Docker, pre-pulls the agent
 *   image and tags the instance `bgagent:status=idle`.
 *
 * Instances are identified by the tag `fleetTagKey=fleetTagValue`, which is set
 * on the Auto Scaling Group and propagated to instances at launch.
 */
export class Ec2AgentFleet extends Construct {
  /** Security group attached to fleet instances (egress TCP 443 only). */
  public readonly securityGroup: ec2.SecurityGroup;
  /** IAM role assumed by fleet instances. */
  public readonly instanceRole: iam.Role;
  /** Bucket used for payload overflow; the instance role can read from it. */
  public readonly payloadBucket: s3.Bucket;
  /** Auto Scaling Group that manages the fleet instances. */
  public readonly autoScalingGroup: autoscaling.AutoScalingGroup;
  /** Tag key that marks an instance as belonging to this fleet (`bgagent:fleet`). */
  public readonly fleetTagKey: string;
  /** Tag value for this fleet; equal to the construct `id`. */
  public readonly fleetTagValue: string;

  /**
   * @param scope Parent construct.
   * @param id Construct id; also used as the fleet tag value.
   * @param props Fleet configuration.
   */
  constructor(scope: Construct, id: string, props: Ec2AgentFleetProps) {
    super(scope, id);

    this.fleetTagKey = 'bgagent:fleet';
    this.fleetTagValue = id;

    // Security group — egress TCP 443 only
    this.securityGroup = new ec2.SecurityGroup(this, 'FleetSG', {
      vpc: props.vpc,
      description: 'EC2 Agent Fleet - egress TCP 443 only',
      allowAllOutbound: false,
    });

    this.securityGroup.addEgressRule(
      ec2.Peer.anyIpv4(),
      ec2.Port.tcp(443),
      'Allow HTTPS egress (GitHub API, AWS services)',
    );

    // S3 bucket for payload overflow
    this.payloadBucket = new s3.Bucket(this, 'PayloadBucket', {
      removalPolicy: RemovalPolicy.DESTROY,
      autoDeleteObjects: true,
      encryption: s3.BucketEncryption.S3_MANAGED,
      enforceSSL: true,
      blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
      lifecycleRules: [
        { expiration: Duration.days(7) },
      ],
    });

    // CloudWatch log group
    const logGroup = new logs.LogGroup(this, 'FleetLogGroup', {
      retention: logs.RetentionDays.THREE_MONTHS,
      removalPolicy: RemovalPolicy.DESTROY,
    });

    // IAM Role for instances
    this.instanceRole = new iam.Role(this, 'InstanceRole', {
      assumedBy: new iam.ServicePrincipal('ec2.amazonaws.com'),
      managedPolicies: [
        iam.ManagedPolicy.fromAwsManagedPolicyName('AmazonSSMManagedInstanceCore'),
      ],
    });

    // DynamoDB read/write on task tables
    props.taskTable.grantReadWriteData(this.instanceRole);
    props.taskEventsTable.grantReadWriteData(this.instanceRole);
    props.userConcurrencyTable.grantReadWriteData(this.instanceRole);

    // Secrets Manager read for GitHub token
    props.githubTokenSecret.grantRead(this.instanceRole);

    // Bedrock model invocation
    this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({
      actions: [
        'bedrock:InvokeModel',
        'bedrock:InvokeModelWithResponseStream',
      ],
      resources: ['*'],
    }));

    // CloudWatch Logs write
    logGroup.grantWrite(this.instanceRole);

    // ECR pull: the auth token action cannot be resource-scoped; the image
    // actions are limited to the agent image repository.
    this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({
      actions: [
        'ecr:GetAuthorizationToken',
      ],
      resources: ['*'],
    }));
    this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({
      actions: [
        'ecr:BatchGetImage',
        'ecr:GetDownloadUrlForLayer',
      ],
      resources: [props.agentImageAsset.repository.repositoryArn],
    }));

    // S3 read on payload bucket
    this.payloadBucket.grantRead(this.instanceRole);

    // EC2 tag management, allowed only on resources that ALREADY carry the
    // fleet tag. The fleet tag must therefore be present before the user data
    // runs; it is propagated from the Auto Scaling Group (see Tags.of below).
    this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({
      actions: ['ec2:CreateTags', 'ec2:DeleteTags'],
      resources: ['*'],
      conditions: {
        StringEquals: {
          [`ec2:ResourceTag/${this.fleetTagKey}`]: this.fleetTagValue,
        },
      },
    }));

    const imageUri = props.agentImageAsset.imageUri;

    // User data: install Docker, pull image, tag as idle
    const userData = ec2.UserData.forLinux();
    userData.addCommands(
      '#!/bin/bash',
      'set -euo pipefail',
      '',
      '# Install Docker',
      'dnf install -y docker',
      'systemctl enable docker',
      'systemctl start docker',
      '',
      '# ECR login and pre-pull agent image',
      'REGION=$(ec2-metadata --availability-zone | cut -d" " -f2 | sed \'s/.$//\')',
      `aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin $(echo '${imageUri}' | cut -d/ -f1)`,
      `docker pull '${imageUri}'`,
      '',
      '# Tag self as idle',
      'INSTANCE_ID=$(ec2-metadata -i | cut -d" " -f2)',
      'aws ec2 create-tags --resources "$INSTANCE_ID" --region "$REGION" --tags Key=bgagent:status,Value=idle',
    );

    // Auto Scaling Group
    // NOTE(review): the AMI is always ARM64; an `instanceType` override with a
    // different architecture would presumably fail to launch — confirm.
    this.autoScalingGroup = new autoscaling.AutoScalingGroup(this, 'ASG', {
      vpc: props.vpc,
      vpcSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS },
      instanceType: props.instanceType ?? new ec2.InstanceType('m7g.xlarge'),
      machineImage: ec2.MachineImage.latestAmazonLinux2023({
        cpuType: ec2.AmazonLinuxCpuType.ARM_64,
      }),
      role: this.instanceRole,
      securityGroup: this.securityGroup,
      userData,
      desiredCapacity: props.desiredCapacity ?? 1,
      minCapacity: props.desiredCapacity ?? 1,
      maxCapacity: props.maxCapacity ?? 3,
      healthCheck: autoscaling.HealthCheck.ec2(),
    });

    // Tag the ASG for fleet identification. CDK marks ASG tags as
    // propagate-at-launch by default, so instances carry the fleet tag without
    // calling ec2:CreateTags themselves. Self-tagging from user data cannot
    // work here: the role's CreateTags permission requires the fleet tag to
    // already exist on the instance, so the call would be denied and, under
    // `set -euo pipefail`, abort the boot script.
    Tags.of(this.autoScalingGroup).add(this.fleetTagKey, this.fleetTagValue);

    NagSuppressions.addResourceSuppressions(this.instanceRole, [
      {
        id: 'AwsSolutions-IAM4',
        reason: 'AmazonSSMManagedInstanceCore is the AWS-recommended managed policy for SSM-managed instances',
      },
      {
        id: 'AwsSolutions-IAM5',
        reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; Bedrock InvokeModel requires * resource; Secrets Manager wildcards from CDK grantRead; CloudWatch Logs wildcards from CDK grantWrite; ECR GetAuthorizationToken requires * resource; EC2 CreateTags/DeleteTags conditioned on fleet tag; S3 read wildcards from CDK grantRead',
      },
    ], true);

    NagSuppressions.addResourceSuppressions(this.autoScalingGroup, [
      {
        id: 'AwsSolutions-AS3',
        reason: 'ASG scaling notifications are not required for this dev/preview compute backend',
      },
      {
        id: 'AwsSolutions-EC26',
        reason: 'EBS encryption uses default AWS-managed key — sufficient for agent ephemeral workloads',
      },
    ], true);

    NagSuppressions.addResourceSuppressions(this.payloadBucket, [
      {
        id: 'AwsSolutions-S1',
        reason: 'Server access logging not required for ephemeral payload overflow bucket with 7-day lifecycle',
      },
    ], true);
  }
}

cdk/src/constructs/task-api.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,14 @@ export interface TaskApiProps {
106106
* When provided, the cancel Lambda gets `ECS_CLUSTER_ARN` env var and `ecs:StopTask` permission.
107107
*/
108108
readonly ecsClusterArn?: string;
109+
110+
/**
111+
* EC2 fleet configuration for cancel-task to stop EC2-backed tasks.
112+
* When provided, the cancel Lambda gets `ssm:CancelCommand` permission.
113+
*/
114+
readonly ec2FleetConfig?: {
115+
readonly instanceRoleArn: string;
116+
};
109117
}
110118

111119
/**
@@ -384,6 +392,13 @@ export class TaskApi extends Construct {
384392
}));
385393
}
386394

395+
if (props.ec2FleetConfig) {
396+
cancelTaskFn.addToRolePolicy(new iam.PolicyStatement({
397+
actions: ['ssm:CancelCommand'],
398+
resources: ['*'],
399+
}));
400+
}
401+
387402
// Repo table read for onboarding gate
388403
if (props.repoTable) {
389404
props.repoTable.grantReadData(createTaskFn);

cdk/src/constructs/task-orchestrator.ts

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919

2020
import * as path from 'path';
21-
import { Duration, Stack } from 'aws-cdk-lib';
21+
import { Aws, Duration, Stack } from 'aws-cdk-lib';
2222
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
2323
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
2424
import * as iam from 'aws-cdk-lib/aws-iam';
@@ -127,6 +127,18 @@ export interface TaskOrchestratorProps {
127127
readonly taskRoleArn: string;
128128
readonly executionRoleArn: string;
129129
};
130+
131+
/**
132+
* EC2 fleet compute strategy configuration.
133+
* When provided, EC2-related env vars and IAM policies are added to the orchestrator.
134+
*/
135+
readonly ec2Config?: {
136+
readonly fleetTagKey: string;
137+
readonly fleetTagValue: string;
138+
readonly payloadBucketName: string;
139+
readonly ecrImageUri: string;
140+
readonly instanceRoleArn: string;
141+
};
130142
}
131143

132144
/**
@@ -195,6 +207,12 @@ export class TaskOrchestrator extends Construct {
195207
ECS_SECURITY_GROUP: props.ecsConfig.securityGroup,
196208
ECS_CONTAINER_NAME: props.ecsConfig.containerName,
197209
}),
210+
...(props.ec2Config && {
211+
EC2_FLEET_TAG_KEY: props.ec2Config.fleetTagKey,
212+
EC2_FLEET_TAG_VALUE: props.ec2Config.fleetTagValue,
213+
EC2_PAYLOAD_BUCKET: props.ec2Config.payloadBucketName,
214+
ECR_IMAGE_URI: props.ec2Config.ecrImageUri,
215+
}),
198216
},
199217
bundling: {
200218
externalModules: ['@aws-sdk/*'],
@@ -262,6 +280,41 @@ export class TaskOrchestrator extends Construct {
262280
}));
263281
}
264282

283+
// EC2 fleet compute strategy permissions (only when EC2 is configured)
284+
if (props.ec2Config) {
285+
this.fn.addToRolePolicy(new iam.PolicyStatement({
286+
actions: [
287+
'ec2:DescribeInstances',
288+
'ec2:CreateTags',
289+
],
290+
resources: ['*'],
291+
}));
292+
293+
this.fn.addToRolePolicy(new iam.PolicyStatement({
294+
actions: [
295+
'ssm:SendCommand',
296+
'ssm:GetCommandInvocation',
297+
'ssm:CancelCommand',
298+
],
299+
resources: ['*'],
300+
}));
301+
302+
this.fn.addToRolePolicy(new iam.PolicyStatement({
303+
actions: ['s3:PutObject'],
304+
resources: [`arn:${Aws.PARTITION}:s3:::${props.ec2Config.payloadBucketName}/*`],
305+
}));
306+
307+
this.fn.addToRolePolicy(new iam.PolicyStatement({
308+
actions: ['iam:PassRole'],
309+
resources: [props.ec2Config.instanceRoleArn],
310+
conditions: {
311+
StringEquals: {
312+
'iam:PassedToService': 'ec2.amazonaws.com',
313+
},
314+
},
315+
}));
316+
}
317+
265318
// Per-repo Secrets Manager grants (e.g. per-repo GitHub tokens from Blueprints)
266319
for (const [index, secretArn] of (props.additionalSecretArns ?? []).entries()) {
267320
const secret = secretsmanager.Secret.fromSecretCompleteArn(

0 commit comments

Comments
 (0)