Skip to content

Commit e985da2

Browse files
author
AWS
committed
Amazon Bedrock AgentCore Update: Adds Ground Truth support for AgentCore Evaluations (Evaluate)
1 parent bb61cd4 commit e985da2

File tree

2 files changed

+122
-27
lines changed

2 files changed

+122
-27
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"type": "feature",
3+
"category": "Amazon Bedrock AgentCore",
4+
"contributor": "",
5+
"description": "Adds Ground Truth support for AgentCore Evaluations (Evaluate)"
6+
}

services/bedrockagentcore/src/main/resources/codegen-resources/service-2.json

Lines changed: 116 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@
409409
{"shape":"ResourceNotFoundException"},
410410
{"shape":"InternalServerException"}
411411
],
412-
"documentation":"<p>Executes a command in a runtime session container. Returns streaming output with contentStart, contentDelta, and contentStop events.</p>"
412+
"documentation":"<p>Executes a command in a runtime session container and streams the output back to the caller. This operation allows you to run shell commands within the agent runtime environment and receive real-time streaming responses including standard output and standard error.</p> <p>To invoke a command, you must specify the agent runtime ARN and a runtime session ID. The command execution supports streaming responses, allowing you to receive output as it becomes available through <code>contentStart</code>, <code>contentDelta</code>, and <code>contentStop</code> events.</p> <p>To use this operation, you must have the <code>bedrock-agentcore:InvokeAgentRuntimeCommand</code> permission.</p>"
413413
},
414414
"InvokeCodeInterpreter":{
415415
"name":"InvokeCodeInterpreter",
@@ -1399,19 +1399,19 @@
13991399
"members":{
14001400
"stdout":{
14011401
"shape":"String",
1402-
"documentation":"<p>Standard output content</p>"
1402+
"documentation":"<p>The standard output content from the command execution. This field contains the incremental output written to stdout by the executing command.</p>"
14031403
},
14041404
"stderr":{
14051405
"shape":"String",
1406-
"documentation":"<p>Standard error content</p>"
1406+
"documentation":"<p>The standard error content from the command execution. This field contains the incremental output written to stderr by the executing command.</p>"
14071407
}
14081408
},
1409-
"documentation":"<p>Content event containing stdout or stderr output</p>"
1409+
"documentation":"<p>An event that contains incremental output from a command execution. This event streams standard output and standard error content as it becomes available during command execution.</p>"
14101410
},
14111411
"ContentStartEvent":{
14121412
"type":"structure",
14131413
"members":{},
1414-
"documentation":"<p>First event indicating command execution has started</p>"
1414+
"documentation":"<p>An event that signals the start of content streaming from a command execution. This event is sent when the command begins producing output.</p>"
14151415
},
14161416
"ContentStopEvent":{
14171417
"type":"structure",
@@ -1422,14 +1422,14 @@
14221422
"members":{
14231423
"exitCode":{
14241424
"shape":"Integer",
1425-
"documentation":"<p>Exit code: 0 = success, -1 = platform error, &gt;0 = command error</p>"
1425+
"documentation":"<p>The exit code returned by the executed command. An exit code of 0 indicates successful execution, -1 indicates a platform error, and values greater than 0 indicate command-specific errors.</p>"
14261426
},
14271427
"status":{
14281428
"shape":"CommandExecutionStatus",
1429-
"documentation":"<p>Execution status</p>"
1429+
"documentation":"<p>The final status of the command execution. Valid values are <code>COMPLETED</code> for successful completion or <code>TIMED_OUT</code> if the command exceeded the specified timeout.</p>"
14301430
}
14311431
},
1432-
"documentation":"<p>Final event indicating command execution has completed</p>"
1432+
"documentation":"<p>An event that signals the completion of a command execution. This event contains the final status and exit code of the executed command.</p>"
14331433
},
14341434
"ContentTextString":{
14351435
"type":"string",
@@ -1679,6 +1679,10 @@
16791679
"evaluationTarget":{
16801680
"shape":"EvaluationTarget",
16811681
"documentation":"<p> The specific trace or span IDs to evaluate within the provided input. Allows targeting evaluation at different levels: individual tool calls, single request-response interactions (traces), or entire conversation sessions. </p>"
1682+
},
1683+
"evaluationReferenceInputs":{
1684+
"shape":"EvaluationReferenceInputs",
1685+
"documentation":"<p> Ground truth data to compare against agent responses during evaluation. Allows you to provide expected responses, assertions, and expected tool trajectories at different evaluation levels. Session-level reference inputs apply to the entire conversation, while trace-level reference inputs target specific request-response interactions identified by trace ID. </p>"
16821686
}
16831687
}
16841688
},
@@ -1692,6 +1696,28 @@
16921696
}
16931697
}
16941698
},
1699+
"EvaluationContent":{
1700+
"type":"structure",
1701+
"members":{
1702+
"text":{
1703+
"shape":"EvaluationContentTextString",
1704+
"documentation":"<p> The text content of the ground truth data. Used for expected response text and assertion statements. </p>"
1705+
}
1706+
},
1707+
"documentation":"<p> A content block for ground truth data in evaluation reference inputs. Supports text content for expected responses and assertions. </p>",
1708+
"union":true
1709+
},
1710+
"EvaluationContentList":{
1711+
"type":"list",
1712+
"member":{"shape":"EvaluationContent"},
1713+
"max":100,
1714+
"min":1
1715+
},
1716+
"EvaluationContentTextString":{
1717+
"type":"string",
1718+
"max":100000,
1719+
"min":1
1720+
},
16951721
"EvaluationErrorCode":{
16961722
"type":"string",
16971723
"max":1024,
@@ -1702,6 +1728,16 @@
17021728
"max":2048,
17031729
"min":0
17041730
},
1731+
"EvaluationExpectedTrajectory":{
1732+
"type":"structure",
1733+
"members":{
1734+
"toolNames":{
1735+
"shape":"EvaluationToolNames",
1736+
"documentation":"<p> The list of tool names representing the expected tool call sequence. </p>"
1737+
}
1738+
},
1739+
"documentation":"<p> The expected tool call trajectory for trajectory-based evaluation. </p>"
1740+
},
17051741
"EvaluationExplanation":{
17061742
"type":"string",
17071743
"max":2048,
@@ -1719,6 +1755,33 @@
17191755
"documentation":"<p> The input data structure containing agent session spans in OpenTelemetry format. Supports traces from frameworks like Strands (AgentCore Runtime) and LangGraph with OpenInference instrumentation for comprehensive evaluation. </p>",
17201756
"union":true
17211757
},
1758+
"EvaluationReferenceInput":{
1759+
"type":"structure",
1760+
"required":["context"],
1761+
"members":{
1762+
"context":{"shape":"Context"},
1763+
"expectedResponse":{
1764+
"shape":"EvaluationContent",
1765+
"documentation":"<p> The expected response for trace-level evaluation. Built-in evaluators that support this field compare the agent's actual response against this value for assessment. Custom evaluators can access it through the <code>{expected_response}</code> placeholder in their instructions. </p>"
1766+
},
1767+
"assertions":{
1768+
"shape":"EvaluationContentList",
1769+
"documentation":"<p> A list of assertion statements for session-level evaluation. Each assertion describes an expected behavior or outcome the agent should demonstrate during the session. </p>"
1770+
},
1771+
"expectedTrajectory":{
1772+
"shape":"EvaluationExpectedTrajectory",
1773+
"documentation":"<p> The expected tool call sequence for session-level trajectory evaluation. Contains a list of tool names representing the tools the agent is expected to invoke. </p>"
1774+
}
1775+
},
1776+
"documentation":"<p> A reference input containing ground truth data for evaluation, scoped to a specific context level (session or trace) through its span context. </p>"
1777+
},
1778+
"EvaluationReferenceInputs":{
1779+
"type":"list",
1780+
"member":{"shape":"EvaluationReferenceInput"},
1781+
"max":1000,
1782+
"min":1,
1783+
"sensitive":true
1784+
},
17221785
"EvaluationResultContent":{
17231786
"type":"structure",
17241787
"required":[
@@ -1767,6 +1830,10 @@
17671830
"errorCode":{
17681831
"shape":"EvaluationErrorCode",
17691832
"documentation":"<p> The error code indicating the type of failure that occurred during evaluation. Used to programmatically identify and handle different categories of evaluation errors. </p>"
1833+
},
1834+
"ignoredReferenceInputFields":{
1835+
"shape":"IgnoredReferenceInputFields",
1836+
"documentation":"<p> The list of reference input field names that were provided but not used by the evaluator. Helps identify which ground truth data was not consumed during evaluation. </p>"
17701837
}
17711838
},
17721839
"documentation":"<p> The comprehensive result of an evaluation containing the score, explanation, evaluator metadata, and execution details. Provides both quantitative ratings and qualitative insights about agent performance. </p>"
@@ -1790,6 +1857,17 @@
17901857
"documentation":"<p> The specification of which trace or span IDs to evaluate within the provided input data. Allows precise targeting of evaluation at different levels: tool calls, traces, or sessions. </p>",
17911858
"union":true
17921859
},
1860+
"EvaluationToolName":{
1861+
"type":"string",
1862+
"max":500,
1863+
"min":1
1864+
},
1865+
"EvaluationToolNames":{
1866+
"type":"list",
1867+
"member":{"shape":"EvaluationToolName"},
1868+
"max":1000,
1869+
"min":0
1870+
},
17931871
"EvaluatorArn":{
17941872
"type":"string",
17951873
"pattern":"arn:aws:bedrock-agentcore:[a-z0-9-]+:[0-9]{12}:evaluator\\/[a-zA-Z][a-zA-Z0-9-_]{0,99}-[a-zA-Z0-9]{10}$|^arn:aws:bedrock-agentcore:::evaluator/Builtin.[a-zA-Z0-9_-]+"
@@ -2469,6 +2547,17 @@
24692547
"type":"integer",
24702548
"box":true
24712549
},
2550+
"IgnoredReferenceInputField":{
2551+
"type":"string",
2552+
"max":1000,
2553+
"min":1
2554+
},
2555+
"IgnoredReferenceInputFields":{
2556+
"type":"list",
2557+
"member":{"shape":"IgnoredReferenceInputField"},
2558+
"max":100,
2559+
"min":0
2560+
},
24722561
"InputContentBlock":{
24732562
"type":"structure",
24742563
"required":["path"],
@@ -2540,7 +2629,7 @@
25402629
},
25412630
"runtimeSessionId":{
25422631
"shape":"SessionType",
2543-
"documentation":"<p>Runtime session identifier</p>",
2632+
"documentation":"<p>The unique identifier of the runtime session in which to execute the command. This session ID is used to maintain state and context across multiple command invocations.</p>",
25442633
"idempotencyToken":true,
25452634
"location":"header",
25462635
"locationName":"X-Amzn-Bedrock-AgentCore-Runtime-Session-Id"
@@ -2571,28 +2660,28 @@
25712660
},
25722661
"agentRuntimeArn":{
25732662
"shape":"String",
2574-
"documentation":"<p>ARN of the agent runtime</p>",
2663+
"documentation":"<p>The Amazon Resource Name (ARN) of the agent runtime on which to execute the command. This identifies the specific agent runtime environment where the command will run.</p>",
25752664
"location":"uri",
25762665
"locationName":"agentRuntimeArn"
25772666
},
25782667
"qualifier":{
25792668
"shape":"String",
2580-
"documentation":"<p>Version or alias qualifier</p>",
2669+
"documentation":"<p>The qualifier to use for the agent runtime. This is an endpoint name that points to a specific version. If not specified, Amazon Bedrock AgentCore uses the default endpoint of the agent runtime.</p>",
25812670
"location":"querystring",
25822671
"locationName":"qualifier"
25832672
},
25842673
"accountId":{
25852674
"shape":"InvokeAgentRuntimeCommandRequestAccountIdString",
2586-
"documentation":"<p>Account ID (12 digits)</p>",
2675+
"documentation":"<p>The identifier of the Amazon Web Services account for the agent runtime resource. This parameter is required when you specify an agent ID instead of the full ARN for <code>agentRuntimeArn</code>.</p>",
25872676
"location":"querystring",
25882677
"locationName":"accountId"
25892678
},
25902679
"body":{
25912680
"shape":"InvokeAgentRuntimeCommandRequestBody",
2592-
"documentation":"<p>Request body containing command and timeout</p>"
2681+
"documentation":"<p>The request body containing the command to execute and optional configuration parameters such as timeout settings.</p>"
25932682
}
25942683
},
2595-
"documentation":"<p>Request for InvokeAgentRuntimeCommand operation</p>",
2684+
"documentation":"<p>Request for the <code>InvokeAgentRuntimeCommand</code> operation.</p>",
25962685
"payload":"body"
25972686
},
25982687
"InvokeAgentRuntimeCommandRequestAccountIdString":{
@@ -2610,14 +2699,14 @@
26102699
"members":{
26112700
"command":{
26122701
"shape":"InvokeAgentRuntimeCommandRequestBodyCommandString",
2613-
"documentation":"<p>The command to execute in the runtime container</p>"
2702+
"documentation":"<p>The shell command to execute on the agent runtime. This command is executed in the runtime environment and its output is streamed back to the caller.</p>"
26142703
},
26152704
"timeout":{
26162705
"shape":"Integer",
2617-
"documentation":"<p>Command timeout in seconds (default: 300, min:1, max: 3600)</p>"
2706+
"documentation":"<p>The maximum duration in seconds to wait for the command to complete. If the command execution exceeds this timeout, it will be terminated. Default is 300 seconds. Minimum is 1 second. Maximum is 3600 seconds.</p>"
26182707
}
26192708
},
2620-
"documentation":"<p>Request body for InvokeAgentRuntimeCommand</p>"
2709+
"documentation":"<p>The request body structure for the <code>InvokeAgentRuntimeCommand</code> operation, containing the command to execute and optional configuration parameters.</p>"
26212710
},
26222711
"InvokeAgentRuntimeCommandRequestBodyCommandString":{
26232712
"type":"string",
@@ -2648,7 +2737,7 @@
26482737
"members":{
26492738
"runtimeSessionId":{
26502739
"shape":"SessionId",
2651-
"documentation":"<p>Runtime session identifier</p>",
2740+
"documentation":"<p>The unique identifier of the runtime session in which the command was executed.</p>",
26522741
"location":"header",
26532742
"locationName":"X-Amzn-Bedrock-AgentCore-Runtime-Session-Id"
26542743
},
@@ -2689,22 +2778,22 @@
26892778
},
26902779
"stream":{
26912780
"shape":"InvokeAgentRuntimeCommandStreamOutput",
2692-
"documentation":"<p>Streaming output containing command execution events</p>"
2781+
"documentation":"<p>The streaming output from the command execution. This stream contains events that provide real-time updates including standard output, standard error, and completion status.</p>"
26932782
}
26942783
},
2695-
"documentation":"<p>Response for InvokeAgentRuntimeCommand operation</p>",
2784+
"documentation":"<p>Response for the <code>InvokeAgentRuntimeCommand</code> operation.</p>",
26962785
"payload":"stream"
26972786
},
26982787
"InvokeAgentRuntimeCommandStreamOutput":{
26992788
"type":"structure",
27002789
"members":{
27012790
"chunk":{
27022791
"shape":"ResponseChunk",
2703-
"documentation":"<p>Response chunk containing command execution events</p>"
2792+
"documentation":"<p>A response chunk containing command execution events such as content start, content delta, or content stop events.</p>"
27042793
},
27052794
"accessDeniedException":{
27062795
"shape":"AccessDeniedException",
2707-
"documentation":"<p>Exception events for error streaming</p>"
2796+
"documentation":"<p>Exception events for error streaming.</p>"
27082797
},
27092798
"internalServerException":{"shape":"InternalServerException"},
27102799
"resourceNotFoundException":{"shape":"ResourceNotFoundException"},
@@ -2713,7 +2802,7 @@
27132802
"validationException":{"shape":"ValidationException"},
27142803
"runtimeClientError":{"shape":"RuntimeClientError"}
27152804
},
2716-
"documentation":"<p>Streaming output for InvokeAgentRuntimeCommand operation Delivers typed events: contentStart (first), contentDelta (middle), contentStop (last)</p>",
2805+
"documentation":"<p>The streaming output union for the <code>InvokeAgentRuntimeCommand</code> operation. This union delivers typed events: <code>contentStart</code> (first), <code>contentDelta</code> (middle), and <code>contentStop</code> (last).</p>",
27172806
"eventstream":true
27182807
},
27192808
"InvokeAgentRuntimeRequest":{
@@ -3841,18 +3930,18 @@
38413930
"members":{
38423931
"contentStart":{
38433932
"shape":"ContentStartEvent",
3844-
"documentation":"<p>First chunk - indicates command execution has started</p>"
3933+
"documentation":"<p>An event indicating the start of content streaming from the command execution. This is the first chunk received.</p>"
38453934
},
38463935
"contentDelta":{
38473936
"shape":"ContentDeltaEvent",
3848-
"documentation":"<p>Middle chunks - stdout/stderr output</p>"
3937+
"documentation":"<p>An event containing incremental output (stdout or stderr) from the command execution. These are the middle chunks.</p>"
38493938
},
38503939
"contentStop":{
38513940
"shape":"ContentStopEvent",
3852-
"documentation":"<p>Last chunk - indicates command execution has completed</p>"
3941+
"documentation":"<p>An event indicating the completion of the command execution, including the exit code and final status. This is the last chunk received.</p>"
38533942
}
38543943
},
3855-
"documentation":"<p>Response chunk containing exactly one of: contentStart, contentDelta, or contentStop</p>",
3944+
"documentation":"<p>A structure representing a response chunk that contains exactly one of the possible event types: <code>contentStart</code>, <code>contentDelta</code>, or <code>contentStop</code>.</p>",
38563945
"event":true
38573946
},
38583947
"ResponseStream":{

0 commit comments

Comments
 (0)