Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions cdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"@aws-cdk/mixins-preview": "2.238.0-alpha.0",
"@aws-sdk/client-bedrock-agentcore": "^3.1021.0",
"@aws-sdk/client-bedrock-runtime": "^3.1021.0",
"@aws-sdk/client-ecs": "^3.1021.0",
"@aws-sdk/client-dynamodb": "^3.1021.0",
"@aws-sdk/client-lambda": "^3.1021.0",
"@aws-sdk/client-secrets-manager": "^3.1021.0",
Expand Down
156 changes: 156 additions & 0 deletions cdk/src/constructs/ecs-agent-cluster.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/**
* MIT No Attribution
*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

import { RemovalPolicy } from 'aws-cdk-lib';
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets';
import * as ecs from 'aws-cdk-lib/aws-ecs';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as logs from 'aws-cdk-lib/aws-logs';
import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
import { NagSuppressions } from 'cdk-nag';
import { Construct } from 'constructs';

export interface EcsAgentClusterProps {
readonly vpc: ec2.IVpc;
readonly agentImageAsset: ecr_assets.DockerImageAsset;
readonly taskTable: dynamodb.ITable;
readonly taskEventsTable: dynamodb.ITable;
readonly userConcurrencyTable: dynamodb.ITable;
readonly githubTokenSecret: secretsmanager.ISecret;
readonly memoryId?: string;
}

export class EcsAgentCluster extends Construct {
public readonly cluster: ecs.Cluster;
public readonly taskDefinition: ecs.FargateTaskDefinition;
public readonly securityGroup: ec2.SecurityGroup;
public readonly containerName: string;
public readonly taskRoleArn: string;
public readonly executionRoleArn: string;

constructor(scope: Construct, id: string, props: EcsAgentClusterProps) {
super(scope, id);

this.containerName = 'AgentContainer';

// ECS Cluster with Fargate capacity provider and container insights
this.cluster = new ecs.Cluster(this, 'Cluster', {
vpc: props.vpc,
containerInsights: true,
});

// Security group — egress TCP 443 only
this.securityGroup = new ec2.SecurityGroup(this, 'TaskSG', {
vpc: props.vpc,
description: 'ECS Agent Tasks - egress TCP 443 only',
allowAllOutbound: false,
});

this.securityGroup.addEgressRule(
ec2.Peer.anyIpv4(),
ec2.Port.tcp(443),
'Allow HTTPS egress (GitHub API, AWS services)',
);

// CloudWatch log group for agent task output
const logGroup = new logs.LogGroup(this, 'TaskLogGroup', {
retention: logs.RetentionDays.THREE_MONTHS,
removalPolicy: RemovalPolicy.DESTROY,
});

// Task execution role (used by ECS agent to pull images, write logs)
// CDK creates this automatically via taskDefinition, but we need to
// grant additional permissions to the task role.

// Fargate task definition
this.taskDefinition = new ecs.FargateTaskDefinition(this, 'TaskDef', {
cpu: 2048,
memoryLimitMiB: 4096,
runtimePlatform: {
cpuArchitecture: ecs.CpuArchitecture.ARM64,
operatingSystemFamily: ecs.OperatingSystemFamily.LINUX,
},
});

// Container
this.taskDefinition.addContainer(this.containerName, {
image: ecs.ContainerImage.fromDockerImageAsset(props.agentImageAsset),
logging: ecs.LogDrivers.awsLogs({
logGroup,
streamPrefix: 'agent',
}),
environment: {
CLAUDE_CODE_USE_BEDROCK: '1',
TASK_TABLE_NAME: props.taskTable.tableName,
TASK_EVENTS_TABLE_NAME: props.taskEventsTable.tableName,
USER_CONCURRENCY_TABLE_NAME: props.userConcurrencyTable.tableName,
LOG_GROUP_NAME: logGroup.logGroupName,
GITHUB_TOKEN_SECRET_ARN: props.githubTokenSecret.secretArn,
...(props.memoryId && { MEMORY_ID: props.memoryId }),
},
});

// Task role permissions
const taskRole = this.taskDefinition.taskRole;

// DynamoDB read/write on task tables
props.taskTable.grantReadWriteData(taskRole);
props.taskEventsTable.grantReadWriteData(taskRole);
props.userConcurrencyTable.grantReadWriteData(taskRole);

// Secrets Manager read for GitHub token
props.githubTokenSecret.grantRead(taskRole);

// Bedrock model invocation
taskRole.addToPrincipalPolicy(new iam.PolicyStatement({
actions: [
'bedrock:InvokeModel',
'bedrock:InvokeModelWithResponseStream',
],
resources: ['*'],
}));

// CloudWatch Logs write
logGroup.grantWrite(taskRole);

// Expose role ARNs for scoped iam:PassRole in the orchestrator
this.taskRoleArn = taskRole.roleArn;
this.executionRoleArn = this.taskDefinition.executionRole!.roleArn;

NagSuppressions.addResourceSuppressions(this.taskDefinition, [
{
id: 'AwsSolutions-IAM5',
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; Bedrock InvokeModel requires * resource; Secrets Manager wildcards from CDK grantRead; CloudWatch Logs wildcards from CDK grantWrite',
},
{
id: 'AwsSolutions-ECS2',
reason: 'Environment variables contain table names and configuration, not secrets — GitHub token is fetched from Secrets Manager at runtime',
},
], true);

NagSuppressions.addResourceSuppressions(this.cluster, [
{
id: 'AwsSolutions-ECS4',
reason: 'Container insights is enabled via the containerInsights prop',
},
], true);
}
}
21 changes: 21 additions & 0 deletions cdk/src/constructs/task-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ export interface TaskApiProps {
* First ARN is also passed as `RUNTIME_ARN` when the task record has no `agent_runtime_arn`.
*/
readonly agentCoreStopSessionRuntimeArns?: string[];

/**
* ECS cluster ARN for cancel-task to stop ECS-backed tasks.
* When provided, the cancel Lambda gets `ECS_CLUSTER_ARN` env var and `ecs:StopTask` permission.
*/
readonly ecsClusterArn?: string;
}

/**
Expand Down Expand Up @@ -329,6 +335,9 @@ export class TaskApi extends Construct {
if (stopSessionArns.length > 0) {
cancelTaskEnv.RUNTIME_ARN = stopSessionArns[0]!;
}
if (props.ecsClusterArn) {
cancelTaskEnv.ECS_CLUSTER_ARN = props.ecsClusterArn;
}

const cancelTaskFn = new lambda.NodejsFunction(this, 'CancelTaskFn', {
entry: path.join(handlersDir, 'cancel-task.ts'),
Expand Down Expand Up @@ -363,6 +372,18 @@ export class TaskApi extends Construct {
}));
}

if (props.ecsClusterArn) {
cancelTaskFn.addToRolePolicy(new iam.PolicyStatement({
actions: ['ecs:StopTask'],
resources: ['*'],
conditions: {
ArnEquals: {
'ecs:cluster': props.ecsClusterArn,
},
},
}));
}

// Repo table read for onboarding gate
if (props.repoTable) {
props.repoTable.grantReadData(createTaskFn);
Expand Down
51 changes: 50 additions & 1 deletion cdk/src/constructs/task-orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,21 @@ export interface TaskOrchestratorProps {
* Bedrock Guardrail version. Required when guardrailId is provided.
*/
readonly guardrailVersion?: string;

/**
* ECS Fargate compute strategy configuration.
* When provided, ECS-related env vars and IAM policies are added to the orchestrator.
* All fields are required — this makes the all-or-nothing constraint self-evident at the type level.
*/
readonly ecsConfig?: {
readonly clusterArn: string;
readonly taskDefinitionArn: string;
readonly subnets: string;
readonly securityGroup: string;
readonly containerName: string;
readonly taskRoleArn: string;
readonly executionRoleArn: string;
};
}

/**
Expand Down Expand Up @@ -173,6 +188,13 @@ export class TaskOrchestrator extends Construct {
...(props.memoryId && { MEMORY_ID: props.memoryId }),
...(props.guardrailId && { GUARDRAIL_ID: props.guardrailId }),
...(props.guardrailVersion && { GUARDRAIL_VERSION: props.guardrailVersion }),
...(props.ecsConfig && {
ECS_CLUSTER_ARN: props.ecsConfig.clusterArn,
ECS_TASK_DEFINITION_ARN: props.ecsConfig.taskDefinitionArn,
ECS_SUBNETS: props.ecsConfig.subnets,
ECS_SECURITY_GROUP: props.ecsConfig.securityGroup,
ECS_CONTAINER_NAME: props.ecsConfig.containerName,
}),
},
bundling: {
externalModules: ['@aws-sdk/*'],
Expand Down Expand Up @@ -213,6 +235,33 @@ export class TaskOrchestrator extends Construct {
resources: runtimeResources,
}));

// ECS compute strategy permissions (only when ECS is configured)
if (props.ecsConfig) {
this.fn.addToRolePolicy(new iam.PolicyStatement({
actions: [
'ecs:RunTask',
'ecs:DescribeTasks',
'ecs:StopTask',
],
resources: ['*'],
conditions: {
ArnEquals: {
'ecs:cluster': props.ecsConfig.clusterArn,
},
},
}));

this.fn.addToRolePolicy(new iam.PolicyStatement({
actions: ['iam:PassRole'],
resources: [props.ecsConfig.taskRoleArn, props.ecsConfig.executionRoleArn],
conditions: {
StringEquals: {
'iam:PassedToService': 'ecs-tasks.amazonaws.com',
},
},
}));
Comment thread
MichaelWalker-git marked this conversation as resolved.
}

// Per-repo Secrets Manager grants (e.g. per-repo GitHub tokens from Blueprints)
for (const [index, secretArn] of (props.additionalSecretArns ?? []).entries()) {
const secret = secretsmanager.Secret.fromSecretCompleteArn(
Expand Down Expand Up @@ -264,7 +313,7 @@ export class TaskOrchestrator extends Construct {
},
{
id: 'AwsSolutions-IAM5',
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; AgentCore runtime/* required for sub-resource invocation; Secrets Manager wildcards generated by CDK grantRead; AgentCore Memory wildcards generated by CDK grantRead/grantWrite',
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; AgentCore runtime/* required for sub-resource invocation; Secrets Manager wildcards generated by CDK grantRead; AgentCore Memory wildcards generated by CDK grantRead/grantWrite; ECS RunTask/DescribeTasks/StopTask conditioned on cluster ARN; iam:PassRole scoped to ECS task/execution roles and conditioned on ecs-tasks.amazonaws.com',
},
], true);
}
Expand Down
63 changes: 52 additions & 11 deletions cdk/src/handlers/cancel-task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import { BedrockAgentCoreClient, StopRuntimeSessionCommand } from '@aws-sdk/client-bedrock-agentcore';
import { DynamoDBClient } from '@aws-sdk/client-dynamodb';
import { ECSClient, StopTaskCommand } from '@aws-sdk/client-ecs';
import { DynamoDBDocumentClient, GetCommand, PutCommand, UpdateCommand } from '@aws-sdk/lib-dynamodb';
import type { APIGatewayProxyEvent, APIGatewayProxyResult } from 'aws-lambda';
import { ulid } from 'ulid';
Expand All @@ -31,10 +32,12 @@ import { computeTtlEpoch } from './shared/validation';

const ddb = DynamoDBDocumentClient.from(new DynamoDBClient({}));
const agentCoreClient = new BedrockAgentCoreClient({});
const ecsClient = new ECSClient({});
const TABLE_NAME = process.env.TASK_TABLE_NAME!;
const EVENTS_TABLE_NAME = process.env.TASK_EVENTS_TABLE_NAME!;
const TASK_RETENTION_DAYS = Number(process.env.TASK_RETENTION_DAYS ?? '90');
const RUNTIME_ARN = process.env.RUNTIME_ARN;
const ECS_CLUSTER_ARN = process.env.ECS_CLUSTER_ARN;

/**
* DELETE /v1/tasks/{task_id} — Cancel a task.
Expand Down Expand Up @@ -107,19 +110,57 @@ export async function handler(event: APIGatewayProxyEvent): Promise<APIGatewayPr
throw condErr;
}

// 6b. Stop AgentCore runtime session so the container winds down (best-effort)
if (wasRunning && runtimeSessionId && agentRuntimeArn) {
try {
await agentCoreClient.send(new StopRuntimeSessionCommand({
runtimeSessionId: runtimeSessionId,
agentRuntimeArn: agentRuntimeArn,
}));
logger.info('StopRuntimeSession invoked after cancel', { task_id: taskId, request_id: requestId });
} catch (stopErr) {
logger.warn('StopRuntimeSession failed after cancel (session may already be gone)', {
// 6b. Stop the compute session so the container winds down (best-effort)
if (wasRunning && runtimeSessionId) {
const computeType = record.compute_type;
if (computeType === 'ecs') {
// ECS-backed task — stop the Fargate task
const clusterArn = record.compute_metadata?.clusterArn ?? ECS_CLUSTER_ARN;
const taskArn = record.compute_metadata?.taskArn;
if (clusterArn && taskArn) {
try {
await ecsClient.send(new StopTaskCommand({
cluster: clusterArn,
task: taskArn,
reason: 'Cancelled by user',
}));
logger.info('ECS StopTask invoked after cancel', { task_id: taskId, ecs_task_arn: taskArn, request_id: requestId });
} catch (stopErr) {
logger.warn('ECS StopTask failed after cancel (task may already be stopped)', {
task_id: taskId,
request_id: requestId,
error: stopErr instanceof Error ? stopErr.message : String(stopErr),
});
}
} else {
logger.warn('ECS task cancel skipped: missing clusterArn or taskArn in compute_metadata', {
task_id: taskId,
request_id: requestId,
has_cluster: !!clusterArn,
has_task: !!taskArn,
});
}
} else if (agentRuntimeArn) {
// AgentCore-backed task (default)
try {
await agentCoreClient.send(new StopRuntimeSessionCommand({
runtimeSessionId: runtimeSessionId,
agentRuntimeArn: agentRuntimeArn,
}));
logger.info('StopRuntimeSession invoked after cancel', { task_id: taskId, request_id: requestId });
} catch (stopErr) {
logger.warn('StopRuntimeSession failed after cancel (session may already be gone)', {
task_id: taskId,
request_id: requestId,
error: stopErr instanceof Error ? stopErr.message : String(stopErr),
});
}
} else {
logger.warn('Running task has no recognized compute backend to stop', {
task_id: taskId,
request_id: requestId,
error: stopErr instanceof Error ? stopErr.message : String(stopErr),
compute_type: computeType,
has_runtime_arn: !!agentRuntimeArn,
});
}
}
Expand Down
Loading
Loading