Skip to content

Commit 11ca658

Browse files
authored
feat: add code-based evaluator support (#739)
* feat: add code-based evaluator support Add managed and external code-based evaluator support across schema, CLI flags, TUI wizard, and template scaffolding. Block code-based evaluators from online eval configs at schema, CLI, and TUI layers. * temp: use pyproject.toml with vendored SDK wheel Vendor the SDK wheel and add binary-aware template rendering until the SDK is published to PyPI. To be removed once the SDK is publicly available. * fix: update asset snapshot and regenerate package-lock.json - Update asset file listing snapshot for new evaluator templates - Regenerate package-lock.json to fix stale aws-cdk bundled dep (@aws-cdk/cloud-assembly-schema 52.2.0 -> 53.11.0) * fix: show correct evaluator type in status display Status command was hardcoding "LLM-as-a-Judge" for all evaluators. Now derives the label from item.config.codeBased to distinguish code-based evaluators. * feat: add additionalPolicies field to managed code-based evaluator config Add additionalPolicies to ManagedCodeBasedConfigSchema supporting both inline .json policy files and managed policy ARNs. Auto-populate with execution-role-policy.json when scaffolding managed evaluators. * revert: remove vendored wheel support and requirements.txt The SDK is now on PyPI (bedrock-agentcore>=1.6.0). Remove: - Binary-aware template rendering (.whl copy logic) - Vendored wheels from evaluator assets - requirements.txt references from scaffold messages pyproject.toml now pulls directly from PyPI. * fix: remove vendored wheel and pin bedrock-agentcore>=1.6.0 Remove the last vendored .whl from src/assets and update pyproject.toml to require bedrock-agentcore>=1.6.0 from PyPI. Update asset snapshot accordingly.
1 parent 1a45c28 commit 11ca658

File tree

20 files changed

+1332
-869
lines changed

20 files changed

+1332
-869
lines changed

package-lock.json

Lines changed: 679 additions & 735 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/assets/__tests__/__snapshots__/assets.snapshot.test.ts.snap

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,9 @@ exports[`Assets Directory Snapshots > File listing > should match the expected f
446446
"cdk/tsconfig.json",
447447
"container/python/Dockerfile",
448448
"container/python/dockerignore.template",
449+
"evaluators/python-lambda/execution-role-policy.json",
450+
"evaluators/python-lambda/lambda_function.py",
451+
"evaluators/python-lambda/pyproject.toml",
449452
"mcp/python-lambda/README.md",
450453
"mcp/python-lambda/handler.py",
451454
"mcp/python-lambda/pyproject.toml",
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"Version": "2012-10-17",
3+
"Statement": [
4+
{
5+
"Effect": "Allow",
6+
"Action": ["logs:CreateLogGroup", "logs:CreateLogStream", "logs:PutLogEvents"],
7+
"Resource": "arn:aws:logs:*:*:log-group:/aws/lambda/*"
8+
}
9+
]
10+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from bedrock_agentcore.evaluation.custom_code_based_evaluators import (
2+
custom_code_based_evaluator,
3+
EvaluatorInput,
4+
EvaluatorOutput,
5+
)
6+
7+
8+
@custom_code_based_evaluator()
9+
def handler(input: EvaluatorInput, context) -> EvaluatorOutput:
10+
"""Evaluate agent behavior with custom logic.
11+
12+
Args:
13+
input: Contains evaluation_level, session_spans, target_trace_id, target_span_id
14+
15+
Returns:
16+
EvaluatorOutput with value/label for success, or errorCode/errorMessage for failure.
17+
"""
18+
# TODO: Replace with your evaluation logic
19+
return EvaluatorOutput(value=1.0, label="Pass", explanation="Evaluation passed")
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
[build-system]
2+
requires = ["hatchling"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
6+
name = "{{ Name }}"
7+
version = "0.1.0"
8+
description = "AgentCore Code-Based Evaluator"
9+
requires-python = ">=3.10"
10+
dependencies = [
11+
"bedrock-agentcore>=1.6.0",
12+
]
13+
14+
[tool.hatch.build.targets.wheel]
15+
packages = ["."]

src/cli/commands/status/__tests__/action.test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,19 @@ describe('computeResourceStatuses', () => {
322322
expect(evalEntry!.detail).toBe('TRACE — LLM-as-a-Judge');
323323
});
324324

325+
it('shows Code-based detail for code-based evaluator', () => {
326+
const project = {
327+
...baseProject,
328+
evaluators: [{ name: 'CodeEval', level: 'SESSION', config: { codeBased: { managed: {} } } }],
329+
} as unknown as AgentCoreProjectSpec;
330+
331+
const result = computeResourceStatuses(project, undefined);
332+
const evalEntry = result.find(r => r.resourceType === 'evaluator' && r.name === 'CodeEval');
333+
334+
expect(evalEntry).toBeDefined();
335+
expect(evalEntry!.detail).toBe('SESSION — Code-based');
336+
});
337+
325338
it('marks evaluator as pending-removal when deployed but removed from schema', () => {
326339
const resources: DeployedResourceState = {
327340
evaluators: {

src/cli/commands/status/action.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ export function computeResourceStatuses(
160160
localItems: project.evaluators ?? [],
161161
deployedRecord: resources?.evaluators ?? {},
162162
getIdentifier: deployed => deployed.evaluatorArn,
163-
getLocalDetail: item => `${item.level} — LLM-as-a-Judge`,
163+
getLocalDetail: item => `${item.level} — ${item.config.codeBased ? 'Code-based' : 'LLM-as-a-Judge'}`,
164164
});
165165

166166
const onlineEvalConfigs = diffResourceSet({

src/cli/primitives/EvaluatorPrimitive.ts

Lines changed: 112 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import type { EvaluationLevel, Evaluator, EvaluatorConfig } from '../../schema';
33
import { EvaluationLevelSchema, EvaluatorSchema } from '../../schema';
44
import { getErrorMessage } from '../errors';
55
import type { RemovalPreview, RemovalResult, SchemaChange } from '../operations/remove/types';
6+
import { renderCodeBasedEvaluatorTemplate } from '../templates/EvaluatorRenderer';
67
import {
78
LEVEL_PLACEHOLDERS,
89
RATING_SCALE_PRESETS,
@@ -12,6 +13,9 @@ import {
1213
import { BasePrimitive } from './BasePrimitive';
1314
import type { AddResult, AddScreenComponent, RemovableResource } from './types';
1415
import type { Command } from '@commander-js/extra-typings';
16+
import { existsSync } from 'node:fs';
17+
import { rm } from 'node:fs/promises';
18+
import { dirname, join } from 'node:path';
1519

1620
export interface AddEvaluatorOptions {
1721
name: string;
@@ -22,6 +26,9 @@ export interface AddEvaluatorOptions {
2226

2327
export type RemovableEvaluator = RemovableResource;
2428

29+
const DEFAULT_CODE_ENTRYPOINT = 'lambda_function.handler';
30+
const DEFAULT_CODE_TIMEOUT = 60;
31+
2532
/**
2633
* EvaluatorPrimitive handles all evaluator add/remove operations.
2734
*/
@@ -31,9 +38,20 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
3138
override readonly article = 'an';
3239
readonly primitiveSchema = EvaluatorSchema;
3340

34-
async add(options: AddEvaluatorOptions): Promise<AddResult<{ evaluatorName: string }>> {
41+
async add(options: AddEvaluatorOptions): Promise<AddResult<{ evaluatorName: string; codePath?: string }>> {
3542
try {
3643
const evaluator = await this.createEvaluator(options);
44+
45+
// Scaffold code for managed code-based evaluators
46+
if (options.config.codeBased?.managed) {
47+
const configRoot = findConfigRoot()!;
48+
const projectRoot = dirname(configRoot);
49+
const codeLocation = options.config.codeBased.managed.codeLocation;
50+
const targetDir = join(projectRoot, codeLocation);
51+
await renderCodeBasedEvaluatorTemplate(options.name, targetDir);
52+
return { success: true, evaluatorName: evaluator.name, codePath: codeLocation };
53+
}
54+
3755
return { success: true, evaluatorName: evaluator.name };
3856
} catch (err) {
3957
return { success: false, error: getErrorMessage(err) };
@@ -59,6 +77,17 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
5977
};
6078
}
6179

80+
// Delete scaffolded code directory for managed code-based evaluators
81+
const evaluator = project.evaluators[index]!;
82+
if (evaluator.config.codeBased?.managed) {
83+
const configRoot = findConfigRoot()!;
84+
const projectRoot = dirname(configRoot);
85+
const codeDir = join(projectRoot, evaluator.config.codeBased.managed.codeLocation);
86+
if (existsSync(codeDir)) {
87+
await rm(codeDir, { recursive: true, force: true });
88+
}
89+
}
90+
6291
project.evaluators.splice(index, 1);
6392
await this.writeProjectSpec(project);
6493

@@ -77,6 +106,7 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
77106
}
78107

79108
const summary: string[] = [`Removing evaluator: ${evaluatorName}`];
109+
const directoriesToDelete: string[] = [];
80110
const schemaChanges: SchemaChange[] = [];
81111

82112
const referencingConfigs = project.onlineEvalConfigs.filter(c => c.evaluators.includes(evaluatorName));
@@ -86,6 +116,18 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
86116
);
87117
}
88118

119+
// Preview code directory deletion for managed code-based evaluators
120+
if (evaluator.config.codeBased?.managed) {
121+
const configRoot = findConfigRoot()!;
122+
const projectRoot = dirname(configRoot);
123+
const codeLocation = evaluator.config.codeBased.managed.codeLocation;
124+
const codeDir = join(projectRoot, codeLocation);
125+
if (existsSync(codeDir)) {
126+
directoriesToDelete.push(codeLocation);
127+
summary.push(`Will delete directory: ${codeLocation}`);
128+
}
129+
}
130+
89131
const afterSpec = {
90132
...project,
91133
evaluators: project.evaluators.filter(e => e.name !== evaluatorName),
@@ -97,7 +139,7 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
97139
after: afterSpec,
98140
});
99141

100-
return { summary, directoriesToDelete: [], schemaChanges };
142+
return { summary, directoriesToDelete, schemaChanges };
101143
}
102144

103145
async getRemovable(): Promise<RemovableEvaluator[]> {
@@ -124,17 +166,17 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
124166
addCmd
125167
.command(this.kind)
126168
.description('Add a custom evaluator to the project')
127-
.option('--name <name>', 'Evaluator name [non-interactive]')
128-
.option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL [non-interactive]')
129-
.option('--model <model>', 'Bedrock model ID for LLM-as-a-Judge [non-interactive]')
169+
.option('--name <name>', 'Evaluator name')
170+
.option('--level <level>', 'Evaluation level: SESSION, TRACE, TOOL_CALL')
171+
.option('--type <type>', 'Evaluator type: llm-as-a-judge (default) or code-based')
172+
.option('--model <model>', '[LLM] Bedrock model ID for LLM-as-a-Judge')
130173
.option(
131174
'--instructions <text>',
132-
'Evaluation prompt instructions (must include level-appropriate placeholders, e.g. {context}) [non-interactive]'
133-
)
134-
.option(
135-
'--rating-scale <preset>',
136-
`Rating scale preset: ${presetIds.join(', ')} (default: 1-5-quality) [non-interactive]`
175+
'[LLM] Evaluation prompt instructions (must include level-appropriate placeholders, e.g. {context})'
137176
)
177+
.option('--rating-scale <preset>', `[LLM] Rating scale preset: ${presetIds.join(', ')} (default: 1-5-quality)`)
178+
.option('--lambda-arn <arn>', '[Code-based] Existing Lambda function ARN (external)')
179+
.option('--timeout <seconds>', '[Code-based] Lambda timeout in seconds, 1-300 (default: 60)')
138180
.option(
139181
'--config <path>',
140182
'Path to evaluator config JSON file (overrides --model, --instructions, --rating-scale) [non-interactive]'
@@ -144,9 +186,12 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
144186
async (cliOptions: {
145187
name?: string;
146188
level?: string;
189+
type?: string;
147190
model?: string;
148191
instructions?: string;
149192
ratingScale?: string;
193+
lambdaArn?: string;
194+
timeout?: string;
150195
config?: string;
151196
json?: boolean;
152197
}) => {
@@ -170,21 +215,40 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
170215
fail('--name and --level are required in non-interactive mode');
171216
}
172217

173-
if (!cliOptions.config && !cliOptions.model) {
174-
fail('Either --config or --model is required');
175-
}
176-
177218
const levelResult = EvaluationLevelSchema.safeParse(cliOptions.level);
178219
if (!levelResult.success) {
179220
fail(`Invalid --level "${cliOptions.level}". Must be one of: SESSION, TRACE, TOOL_CALL`);
180221
}
181222

223+
const evalType = cliOptions.type ?? 'llm-as-a-judge';
224+
if (evalType !== 'llm-as-a-judge' && evalType !== 'code-based') {
225+
fail(`Invalid --type "${evalType}". Must be one of: llm-as-a-judge, code-based`);
226+
}
227+
228+
// Cross-validate flags against evaluator type
229+
if (evalType !== 'code-based') {
230+
if (cliOptions.lambdaArn) fail('--lambda-arn requires --type code-based');
231+
if (cliOptions.timeout) fail('--timeout requires --type code-based');
232+
}
233+
if (evalType === 'code-based') {
234+
if (cliOptions.model) fail('--model cannot be used with --type code-based');
235+
if (cliOptions.instructions) fail('--instructions cannot be used with --type code-based');
236+
if (cliOptions.ratingScale) fail('--rating-scale cannot be used with --type code-based');
237+
}
238+
182239
let configJson: EvaluatorConfig;
240+
183241
if (cliOptions.config) {
184242
const { readFileSync } = await import('fs');
185243
configJson = JSON.parse(readFileSync(cliOptions.config, 'utf-8')) as EvaluatorConfig;
244+
} else if (evalType === 'code-based') {
245+
configJson = this.buildCodeBasedConfig(cliOptions.name!, cliOptions.lambdaArn, cliOptions.timeout);
186246
} else {
187-
// --instructions is required when not using --config
247+
// LLM-as-a-Judge flow
248+
if (!cliOptions.model) {
249+
fail('Either --config or --model is required for LLM-as-a-Judge evaluators');
250+
}
251+
188252
if (!cliOptions.instructions) {
189253
const level = levelResult.data!;
190254
const placeholders = LEVEL_PLACEHOLDERS[level].map(p => `{${p}}`).join(', ');
@@ -194,21 +258,18 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
194258
);
195259
}
196260

197-
// Validate placeholders
198261
const placeholderCheck = validateInstructionPlaceholders(cliOptions.instructions!, levelResult.data!);
199262
if (placeholderCheck !== true) {
200263
fail(placeholderCheck);
201264
}
202265

203-
// Resolve rating scale
204-
let ratingScale: EvaluatorConfig['llmAsAJudge']['ratingScale'];
266+
let ratingScale: NonNullable<EvaluatorConfig['llmAsAJudge']>['ratingScale'];
205267
const scaleInput = cliOptions.ratingScale ?? '1-5-quality';
206268

207269
const preset = RATING_SCALE_PRESETS.find(p => p.id === scaleInput);
208270
if (preset) {
209271
ratingScale = preset.ratingScale;
210272
} else {
211-
// Try parsing as custom format: "1:Poor:Fails, 2:Fair:Partially meets" or "Pass:Meets, Fail:Does not"
212273
const isNumerical = /^\d/.test(scaleInput.trim());
213274
const parsed = parseCustomRatingScale(scaleInput, isNumerical ? 'numerical' : 'categorical');
214275
if (!parsed.success) {
@@ -239,7 +300,16 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
239300
if (cliOptions.json) {
240301
console.log(JSON.stringify(result));
241302
} else if (result.success) {
242-
console.log(`Added evaluator '${result.evaluatorName}'`);
303+
if (result.codePath) {
304+
console.log(`Created evaluator '${result.evaluatorName}'`);
305+
console.log(` Code: ${result.codePath}lambda_function.py`);
306+
console.log(` IAM: ${result.codePath}execution-role-policy.json`);
307+
console.log(
308+
`\n Next: Edit lambda_function.py with your evaluation logic, then run \`agentcore deploy\``
309+
);
310+
} else {
311+
console.log(`Added evaluator '${result.evaluatorName}'`);
312+
}
243313
} else {
244314
console.error(result.error);
245315
}
@@ -280,6 +350,28 @@ export class EvaluatorPrimitive extends BasePrimitive<AddEvaluatorOptions, Remov
280350
return null;
281351
}
282352

353+
private buildCodeBasedConfig(name: string, lambdaArn?: string, timeoutStr?: string): EvaluatorConfig {
354+
if (lambdaArn) {
355+
return {
356+
codeBased: {
357+
external: { lambdaArn },
358+
},
359+
};
360+
}
361+
362+
const timeoutSeconds = timeoutStr ? parseInt(timeoutStr, 10) : DEFAULT_CODE_TIMEOUT;
363+
return {
364+
codeBased: {
365+
managed: {
366+
codeLocation: `app/${name}/`,
367+
entrypoint: DEFAULT_CODE_ENTRYPOINT,
368+
timeoutSeconds,
369+
additionalPolicies: ['execution-role-policy.json'],
370+
},
371+
},
372+
};
373+
}
374+
283375
private async createEvaluator(options: AddEvaluatorOptions): Promise<Evaluator> {
284376
const project = await this.readProjectSpec();
285377

src/cli/primitives/OnlineEvalConfigPrimitive.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,18 @@ export class OnlineEvalConfigPrimitive extends BasePrimitive<AddOnlineEvalConfig
210210

211211
this.checkDuplicate(project.onlineEvalConfigs, options.name, 'Online eval config');
212212

213+
// Block code-based evaluators — only LLM-as-a-Judge evaluators are supported for online evaluation.
214+
// Checks local project config. ARN-based evaluators are filtered in the TUI by API evaluatorType.
215+
// TODO: For ARN-based evaluators in non-interactive mode, call getEvaluator to check type.
216+
for (const evalName of options.evaluators) {
217+
const evaluator = project.evaluators.find(e => e.name === evalName);
218+
if (evaluator?.config.codeBased) {
219+
throw new Error(
220+
`Code-based evaluator "${evalName}" cannot be used in online eval configs. Only LLM-as-a-Judge evaluators are supported for online evaluation.`
221+
);
222+
}
223+
}
224+
213225
const config: OnlineEvalConfig = {
214226
name: options.name,
215227
agent: options.agent,

src/cli/primitives/__tests__/EvaluatorPrimitive.test.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@ const validConfig: EvaluatorConfig = {
2626
},
2727
};
2828

29+
function makeEvaluator(name: string, config?: EvaluatorConfig) {
30+
return {
31+
name,
32+
type: 'CustomEvaluator',
33+
level: 'SESSION',
34+
config: config ?? validConfig,
35+
};
36+
}
37+
2938
function makeProject(
3039
evaluators: { name: string }[] = [],
3140
onlineEvalConfigs: { name: string; evaluators: string[] }[] = []
@@ -37,7 +46,7 @@ function makeProject(
3746
runtimes: [],
3847
memories: [],
3948
credentials: [],
40-
evaluators,
49+
evaluators: evaluators.map(e => ('config' in e ? e : makeEvaluator(e.name))),
4150
onlineEvalConfigs,
4251
};
4352
}

0 commit comments

Comments
 (0)