Skip to content

Commit d970e26

Browse files
authored
feat: introduce evaluation feature (#518)
* feat: add evals control plane operations
* feat: add functionality to run evaluation and online evals
* fix: eval data plane code clean up
* tests: add test coverage for evals
* add support for running evals on agents created outside the cli
* feat: add eval discovery commands, status enrichment, and schema updates
* fix: add API limit of 10 on spanIds and TUI changes
* feat: add evals in resourcegraph
* feat: add eval TUI screens, online eval dashboard, and run eval wizard
* feat: remove stop online-eval command, use remove + deploy instead
* chore: remove unused get-eval-run module, add ARN mode tests for pause-resume
* docs: update AGENTS.md with eval primitives, clarify sampling rate and default model
* fix: bump aws-cdk-lib to 2.243.0 and remove description from UpdateOnlineEvalOptions
  - Update scaffolded aws-cdk-lib from 2.239.0 to 2.243.0 so CfnEvaluator is available in aws-cdk-lib/aws-bedrockagentcore
  - Remove description field from UpdateOnlineEvalOptions since description updates should be managed via CDK, not API calls
* fix: skip requireProject for run eval in ARN mode
* feat: add session discovery and selection step to run eval TUI
* feat: improve command docs, rating scale clarity, ARN support messaging
* feat: add eval level guidance, sampling rate context, score interpretation, placeholder docs
* chore: rename eval command to evals throughout codebase
* fix: require --instructions and validate placeholders for add evaluator CLI mode
* fix: restore MCP/A2A code lost during rebase conflict resolution, fix dev tests
* fix: agents.md
1 parent c51b1e2 commit d970e26

102 files changed

Lines changed: 9940 additions & 68 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

AGENTS.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,16 @@ Note: CDK L3 constructs are in a separate package `@aws/agentcore-cdk`.
2424
## CLI Commands
2525

2626
- `create` - Create new AgentCore project
27-
- `add` - Add resources (agent, memory, identity, target)
28-
- `remove` - Remove resources (agent, memory, identity, target, all)
27+
- `add` - Add resources (agent, memory, identity, evaluator, online-eval, target)
28+
- `remove` - Remove resources (agent, memory, identity, evaluator, online-eval, target, all)
2929
- `deploy` - Deploy infrastructure to AWS
3030
- `status` - Check deployment status
3131
- `dev` - Local development server (CodeZip: uvicorn with hot-reload; Container: Docker build + run with volume mount)
3232
- `invoke` - Invoke agents (local or deployed)
33+
- `run eval` - Run on-demand evaluation against agent sessions
34+
- `eval history` - View past eval run results
35+
- `pause online-eval` - Pause (disable) a deployed online eval config
36+
- `resume online-eval` - Resume (enable) a paused online eval config
3337
- `package` - Package agent artifacts without deploying (zip for CodeZip, container image build for Container)
3438
- `validate` - Validate configuration files
3539
- `update` - Check for CLI updates
@@ -60,6 +64,8 @@ Current primitives:
6064
- `AgentPrimitive` — agent creation (template + BYO), removal, credential resolution
6165
- `MemoryPrimitive` — memory creation with strategies, removal
6266
- `CredentialPrimitive` — credential/identity creation, .env management, removal
67+
- `EvaluatorPrimitive` — custom evaluator creation/removal with cross-reference validation
68+
- `OnlineEvalConfigPrimitive` — online eval config creation/removal
6369
- `GatewayPrimitive` — MCP gateway creation/removal
6470
- `GatewayTargetPrimitive` — MCP tool creation/removal with code generation
6571

src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ exports[`Assets Directory Snapshots > CDK assets > cdk/cdk/package.json should m
351351
},
352352
"dependencies": {
353353
"@aws/agentcore-cdk": "^0.1.0-alpha.1",
354-
"aws-cdk-lib": "2.239.0",
354+
"aws-cdk-lib": "2.243.0",
355355
"constructs": "^10.0.0"
356356
}
357357
}
@@ -372,6 +372,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
372372
agents: [],
373373
memories: [],
374374
credentials: [],
375+
evaluators: [],
376+
onlineEvalConfigs: [],
375377
},
376378
});
377379
const template = Template.fromStack(stack);

src/assets/cdk/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
},
2525
"dependencies": {
2626
"@aws/agentcore-cdk": "^0.1.0-alpha.1",
27-
"aws-cdk-lib": "2.239.0",
27+
"aws-cdk-lib": "2.243.0",
2828
"constructs": "^10.0.0"
2929
}
3030
}

src/assets/cdk/test/cdk.test.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
1111
agents: [],
1212
memories: [],
1313
credentials: [],
14+
evaluators: [],
15+
onlineEvalConfigs: [],
1416
},
1517
});
1618
const template = Template.fromStack(stack);

src/cli/aws/__tests__/agentcore-control.test.ts

Lines changed: 308 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
1-
import { getAgentRuntimeStatus } from '../agentcore-control.js';
1+
import {
2+
getAgentRuntimeStatus,
3+
getEvaluator,
4+
getOnlineEvaluationConfig,
5+
listEvaluators,
6+
updateOnlineEvalExecutionStatus,
7+
} from '../agentcore-control.js';
28
import { beforeEach, describe, expect, it, vi } from 'vitest';
39

410
const { mockSend } = vi.hoisted(() => ({
@@ -12,6 +18,18 @@ vi.mock('@aws-sdk/client-bedrock-agentcore-control', () => ({
1218
GetAgentRuntimeCommand: class {
1319
constructor(public input: unknown) {}
1420
},
21+
GetEvaluatorCommand: class {
22+
constructor(public input: unknown) {}
23+
},
24+
GetOnlineEvaluationConfigCommand: class {
25+
constructor(public input: unknown) {}
26+
},
27+
ListEvaluatorsCommand: class {
28+
constructor(public input: unknown) {}
29+
},
30+
UpdateOnlineEvaluationConfigCommand: class {
31+
constructor(public input: unknown) {}
32+
},
1533
}));
1634

1735
vi.mock('../account', () => ({
@@ -56,3 +74,292 @@ describe('getAgentRuntimeStatus', () => {
5674
);
5775
});
5876
});
77+
78+
describe('getEvaluator', () => {
79+
beforeEach(() => {
80+
vi.clearAllMocks();
81+
});
82+
83+
it('returns evaluator details', async () => {
84+
mockSend.mockResolvedValue({
85+
evaluatorId: 'eval-123',
86+
evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-123',
87+
evaluatorName: 'my-evaluator',
88+
level: 'SESSION',
89+
status: 'ACTIVE',
90+
description: 'A test evaluator',
91+
});
92+
93+
const result = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-123' });
94+
expect(result.evaluatorId).toBe('eval-123');
95+
expect(result.evaluatorName).toBe('my-evaluator');
96+
expect(result.level).toBe('SESSION');
97+
expect(result.status).toBe('ACTIVE');
98+
expect(result.description).toBe('A test evaluator');
99+
});
100+
101+
it('throws when no evaluatorId in response', async () => {
102+
mockSend.mockResolvedValue({ evaluatorId: undefined });
103+
104+
await expect(getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-missing' })).rejects.toThrow(
105+
'No evaluator found for ID eval-missing'
106+
);
107+
});
108+
109+
it('passes correct evaluatorId in command', async () => {
110+
mockSend.mockResolvedValue({
111+
evaluatorId: 'eval-abc',
112+
evaluatorName: 'test',
113+
level: 'TRACE',
114+
status: 'ACTIVE',
115+
});
116+
117+
await getEvaluator({ region: 'us-west-2', evaluatorId: 'eval-abc' });
118+
119+
const command = mockSend.mock.calls[0]![0];
120+
expect(command.input.evaluatorId).toBe('eval-abc');
121+
});
122+
123+
it('defaults level to SESSION when undefined', async () => {
124+
mockSend.mockResolvedValue({
125+
evaluatorId: 'eval-no-level',
126+
level: undefined,
127+
status: 'ACTIVE',
128+
});
129+
130+
const result = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-no-level' });
131+
expect(result.level).toBe('SESSION');
132+
});
133+
134+
it('propagates SDK errors', async () => {
135+
mockSend.mockRejectedValue(new Error('AccessDenied'));
136+
137+
await expect(getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-err' })).rejects.toThrow('AccessDenied');
138+
});
139+
});
140+
141+
describe('updateOnlineEvalExecutionStatus', () => {
142+
beforeEach(() => {
143+
vi.clearAllMocks();
144+
});
145+
146+
it('sends DISABLED to pause and returns result', async () => {
147+
mockSend.mockResolvedValue({
148+
onlineEvaluationConfigId: 'cfg-123',
149+
executionStatus: 'DISABLED',
150+
status: 'ACTIVE',
151+
});
152+
153+
const result = await updateOnlineEvalExecutionStatus({
154+
region: 'us-east-1',
155+
onlineEvaluationConfigId: 'cfg-123',
156+
executionStatus: 'DISABLED',
157+
});
158+
159+
expect(result.configId).toBe('cfg-123');
160+
expect(result.executionStatus).toBe('DISABLED');
161+
expect(result.status).toBe('ACTIVE');
162+
});
163+
164+
it('sends ENABLED to resume', async () => {
165+
mockSend.mockResolvedValue({
166+
onlineEvaluationConfigId: 'cfg-456',
167+
executionStatus: 'ENABLED',
168+
status: 'ACTIVE',
169+
});
170+
171+
const result = await updateOnlineEvalExecutionStatus({
172+
region: 'us-west-2',
173+
onlineEvaluationConfigId: 'cfg-456',
174+
executionStatus: 'ENABLED',
175+
});
176+
177+
expect(result.configId).toBe('cfg-456');
178+
expect(result.executionStatus).toBe('ENABLED');
179+
});
180+
181+
it('passes correct params in command', async () => {
182+
mockSend.mockResolvedValue({
183+
onlineEvaluationConfigId: 'cfg-789',
184+
executionStatus: 'DISABLED',
185+
status: 'ACTIVE',
186+
});
187+
188+
await updateOnlineEvalExecutionStatus({
189+
region: 'us-east-1',
190+
onlineEvaluationConfigId: 'cfg-789',
191+
executionStatus: 'DISABLED',
192+
});
193+
194+
const command = mockSend.mock.calls[0]![0];
195+
expect(command.input.onlineEvaluationConfigId).toBe('cfg-789');
196+
expect(command.input.executionStatus).toBe('DISABLED');
197+
});
198+
199+
it('falls back to input values when response fields are undefined', async () => {
200+
mockSend.mockResolvedValue({});
201+
202+
const result = await updateOnlineEvalExecutionStatus({
203+
region: 'us-east-1',
204+
onlineEvaluationConfigId: 'cfg-fallback',
205+
executionStatus: 'ENABLED',
206+
});
207+
208+
expect(result.configId).toBe('cfg-fallback');
209+
expect(result.executionStatus).toBe('ENABLED');
210+
expect(result.status).toBe('UNKNOWN');
211+
});
212+
213+
it('propagates SDK errors', async () => {
214+
mockSend.mockRejectedValue(new Error('Throttling'));
215+
216+
await expect(
217+
updateOnlineEvalExecutionStatus({
218+
region: 'us-east-1',
219+
onlineEvaluationConfigId: 'cfg-err',
220+
executionStatus: 'DISABLED',
221+
})
222+
).rejects.toThrow('Throttling');
223+
});
224+
});
225+
226+
describe('getOnlineEvaluationConfig', () => {
227+
beforeEach(() => {
228+
vi.clearAllMocks();
229+
});
230+
231+
it('returns config details with output log group', async () => {
232+
mockSend.mockResolvedValue({
233+
onlineEvaluationConfigId: 'oec-123',
234+
onlineEvaluationConfigArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:online-eval/oec-123',
235+
onlineEvaluationConfigName: 'my-online-eval',
236+
status: 'ACTIVE',
237+
executionStatus: 'ENABLED',
238+
description: 'Production eval',
239+
outputConfig: {
240+
cloudWatchConfig: { logGroupName: '/aws/bedrock-agentcore/evaluations/oec-123' },
241+
},
242+
});
243+
244+
const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-123' });
245+
expect(result.configId).toBe('oec-123');
246+
expect(result.configName).toBe('my-online-eval');
247+
expect(result.status).toBe('ACTIVE');
248+
expect(result.executionStatus).toBe('ENABLED');
249+
expect(result.description).toBe('Production eval');
250+
expect(result.outputLogGroupName).toBe('/aws/bedrock-agentcore/evaluations/oec-123');
251+
});
252+
253+
it('throws when no configId in response', async () => {
254+
mockSend.mockResolvedValue({ onlineEvaluationConfigId: undefined });
255+
256+
await expect(getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-missing' })).rejects.toThrow(
257+
'No online evaluation config found for ID oec-missing'
258+
);
259+
});
260+
261+
it('returns failureReason when present', async () => {
262+
mockSend.mockResolvedValue({
263+
onlineEvaluationConfigId: 'oec-fail',
264+
onlineEvaluationConfigName: 'broken-eval',
265+
status: 'CREATE_FAILED',
266+
executionStatus: 'DISABLED',
267+
failureReason: 'IAM role not found',
268+
});
269+
270+
const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-fail' });
271+
expect(result.status).toBe('CREATE_FAILED');
272+
expect(result.failureReason).toBe('IAM role not found');
273+
});
274+
275+
it('handles missing outputConfig', async () => {
276+
mockSend.mockResolvedValue({
277+
onlineEvaluationConfigId: 'oec-no-output',
278+
status: 'CREATING',
279+
executionStatus: 'DISABLED',
280+
});
281+
282+
const result = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-no-output' });
283+
expect(result.outputLogGroupName).toBeUndefined();
284+
});
285+
286+
it('passes correct configId in command', async () => {
287+
mockSend.mockResolvedValue({
288+
onlineEvaluationConfigId: 'oec-abc',
289+
status: 'ACTIVE',
290+
executionStatus: 'ENABLED',
291+
});
292+
293+
await getOnlineEvaluationConfig({ region: 'us-west-2', configId: 'oec-abc' });
294+
295+
const command = mockSend.mock.calls[0]![0];
296+
expect(command.input.onlineEvaluationConfigId).toBe('oec-abc');
297+
});
298+
299+
it('propagates SDK errors', async () => {
300+
mockSend.mockRejectedValue(new Error('ResourceNotFoundException'));
301+
302+
await expect(getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-err' })).rejects.toThrow(
303+
'ResourceNotFoundException'
304+
);
305+
});
306+
});
307+
308+
describe('listEvaluators', () => {
309+
beforeEach(() => {
310+
vi.clearAllMocks();
311+
});
312+
313+
it('returns evaluator summaries', async () => {
314+
mockSend.mockResolvedValue({
315+
evaluators: [
316+
{
317+
evaluatorId: 'eval-1',
318+
evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-1',
319+
evaluatorName: 'Faithfulness',
320+
evaluatorType: 'Builtin',
321+
status: 'ACTIVE',
322+
},
323+
{
324+
evaluatorId: 'eval-2',
325+
evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-2',
326+
evaluatorName: 'my-custom',
327+
evaluatorType: 'Custom',
328+
status: 'ACTIVE',
329+
description: 'A custom evaluator',
330+
},
331+
],
332+
});
333+
334+
const result = await listEvaluators({ region: 'us-east-1' });
335+
expect(result.evaluators).toHaveLength(2);
336+
expect(result.evaluators[0]!.evaluatorName).toBe('Faithfulness');
337+
expect(result.evaluators[0]!.evaluatorType).toBe('Builtin');
338+
expect(result.evaluators[1]!.evaluatorName).toBe('my-custom');
339+
expect(result.evaluators[1]!.description).toBe('A custom evaluator');
340+
});
341+
342+
it('returns empty array when no evaluators', async () => {
343+
mockSend.mockResolvedValue({ evaluators: undefined });
344+
345+
const result = await listEvaluators({ region: 'us-east-1' });
346+
expect(result.evaluators).toEqual([]);
347+
});
348+
349+
it('passes maxResults and nextToken', async () => {
350+
mockSend.mockResolvedValue({ evaluators: [], nextToken: 'token-2' });
351+
352+
const result = await listEvaluators({ region: 'us-east-1', maxResults: 5, nextToken: 'token-1' });
353+
354+
const command = mockSend.mock.calls[0]![0];
355+
expect(command.input.maxResults).toBe(5);
356+
expect(command.input.nextToken).toBe('token-1');
357+
expect(result.nextToken).toBe('token-2');
358+
});
359+
360+
it('propagates SDK errors', async () => {
361+
mockSend.mockRejectedValue(new Error('AccessDeniedException'));
362+
363+
await expect(listEvaluators({ region: 'us-east-1' })).rejects.toThrow('AccessDeniedException');
364+
});
365+
});

0 commit comments

Comments (0)