Amazon Bedrock AgentCore Update: Adds Ground Truth support for AgentCore Evaluations (Evaluate)

AWS · AWS · commit e985da2d52fb · 2026-03-30T18:18:13.000Z
diff --git a/.changes/next-release/feature-AmazonBedrockAgentCore-cc0c463.json b/.changes/next-release/feature-AmazonBedrockAgentCore-cc0c463.json
@@ -0,0 +1,6 @@
+{
+    "type": "feature",
+    "category": "Amazon Bedrock AgentCore",
+    "contributor": "",
+    "description": "Adds Ground Truth support for AgentCore Evaluations (Evaluate)"
+}
diff --git a/services/bedrockagentcore/src/main/resources/codegen-resources/service-2.json b/services/bedrockagentcore/src/main/resources/codegen-resources/service-2.json
@@ -409,7 +409,7 @@
         {"shape":"ResourceNotFoundException"},
         {"shape":"InternalServerException"}
       ],
-      "documentation":"<p>Executes a command in a runtime session container. Returns streaming output with contentStart, contentDelta, and contentStop events.</p>"
+      "documentation":"<p>Executes a command in a runtime session container and streams the output back to the caller. This operation allows you to run shell commands within the agent runtime environment and receive real-time streaming responses including standard output and standard error.</p> <p>To invoke a command, you must specify the agent runtime ARN and a runtime session ID. The command execution supports streaming responses, allowing you to receive output as it becomes available through <code>contentStart</code>, <code>contentDelta</code>, and <code>contentStop</code> events.</p> <p>To use this operation, you must have the <code>bedrock-agentcore:InvokeAgentRuntimeCommand</code> permission.</p>"
     },
     "InvokeCodeInterpreter":{
       "name":"InvokeCodeInterpreter",
@@ -1399,19 +1399,19 @@
       "members":{
         "stdout":{
           "shape":"String",
-          "documentation":"<p>Standard output content</p>"
+          "documentation":"<p>The standard output content from the command execution. This field contains the incremental output written to stdout by the executing command.</p>"
         },
         "stderr":{
           "shape":"String",
-          "documentation":"<p>Standard error content</p>"
+          "documentation":"<p>The standard error content from the command execution. This field contains the incremental output written to stderr by the executing command.</p>"
         }
       },
-      "documentation":"<p>Content event containing stdout or stderr output</p>"
+      "documentation":"<p>An event that contains incremental output from a command execution. This event streams standard output and standard error content as it becomes available during command execution.</p>"
     },
     "ContentStartEvent":{
       "type":"structure",
       "members":{},
-      "documentation":"<p>First event indicating command execution has started</p>"
+      "documentation":"<p>An event that signals the start of content streaming from a command execution. This event is sent when the command begins producing output.</p>"
     },
     "ContentStopEvent":{
       "type":"structure",
@@ -1422,14 +1422,14 @@
       "members":{
         "exitCode":{
           "shape":"Integer",
-          "documentation":"<p>Exit code: 0 = success, -1 = platform error, &gt;0 = command error</p>"
+          "documentation":"<p>The exit code returned by the executed command. An exit code of 0 indicates successful execution, -1 indicates a platform error, and values greater than 0 indicate command-specific errors.</p>"
         },
         "status":{
           "shape":"CommandExecutionStatus",
-          "documentation":"<p>Execution status</p>"
+          "documentation":"<p>The final status of the command execution. Valid values are <code>COMPLETED</code> for successful completion or <code>TIMED_OUT</code> if the command exceeded the specified timeout.</p>"
         }
       },
-      "documentation":"<p>Final event indicating command execution has completed</p>"
+      "documentation":"<p>An event that signals the completion of a command execution. This event contains the final status and exit code of the executed command.</p>"
     },
     "ContentTextString":{
       "type":"string",
@@ -1679,6 +1679,10 @@
         "evaluationTarget":{
           "shape":"EvaluationTarget",
           "documentation":"<p> The specific trace or span IDs to evaluate within the provided input. Allows targeting evaluation at different levels: individual tool calls, single request-response interactions (traces), or entire conversation sessions. </p>"
+        },
+        "evaluationReferenceInputs":{
+          "shape":"EvaluationReferenceInputs",
+          "documentation":"<p> Ground truth data to compare against agent responses during evaluation. Allows you to provide expected responses, assertions, and expected tool trajectories at different evaluation levels. Session-level reference inputs apply to the entire conversation, while trace-level reference inputs target specific request-response interactions identified by trace ID. </p>"
         }
       }
     },
@@ -1692,6 +1696,28 @@
         }
       }
     },
+    "EvaluationContent":{
+      "type":"structure",
+      "members":{
+        "text":{
+          "shape":"EvaluationContentTextString",
+          "documentation":"<p> The text content of the ground truth data. Used for expected response text and assertion statements. </p>"
+        }
+      },
+      "documentation":"<p> A content block for ground truth data in evaluation reference inputs. Supports text content for expected responses and assertions. </p>",
+      "union":true
+    },
+    "EvaluationContentList":{
+      "type":"list",
+      "member":{"shape":"EvaluationContent"},
+      "max":100,
+      "min":1
+    },
+    "EvaluationContentTextString":{
+      "type":"string",
+      "max":100000,
+      "min":1
+    },
     "EvaluationErrorCode":{
       "type":"string",
       "max":1024,
@@ -1702,6 +1728,16 @@
       "max":2048,
       "min":0
     },
+    "EvaluationExpectedTrajectory":{
+      "type":"structure",
+      "members":{
+        "toolNames":{
+          "shape":"EvaluationToolNames",
+          "documentation":"<p> The list of tool names representing the expected tool call sequence. </p>"
+        }
+      },
+      "documentation":"<p> The expected tool call trajectory for trajectory-based evaluation. </p>"
+    },
     "EvaluationExplanation":{
       "type":"string",
       "max":2048,
@@ -1719,6 +1755,33 @@
       "documentation":"<p> The input data structure containing agent session spans in OpenTelemetry format. Supports traces from frameworks like Strands (AgentCore Runtime) and LangGraph with OpenInference instrumentation for comprehensive evaluation. </p>",
       "union":true
     },
+    "EvaluationReferenceInput":{
+      "type":"structure",
+      "required":["context"],
+      "members":{
+        "context":{"shape":"Context"},
+        "expectedResponse":{
+          "shape":"EvaluationContent",
+          "documentation":"<p> The expected response for trace-level evaluation. Built-in evaluators that support this field compare the agent's actual response against this value for assessment. Custom evaluators can access it through the <code>{expected_response}</code> placeholder in their instructions. </p>"
+        },
+        "assertions":{
+          "shape":"EvaluationContentList",
+          "documentation":"<p> A list of assertion statements for session-level evaluation. Each assertion describes an expected behavior or outcome the agent should demonstrate during the session. </p>"
+        },
+        "expectedTrajectory":{
+          "shape":"EvaluationExpectedTrajectory",
+          "documentation":"<p> The expected tool call sequence for session-level trajectory evaluation. Contains a list of tool names representing the tools the agent is expected to invoke. </p>"
+        }
+      },
+      "documentation":"<p> A reference input containing ground truth data for evaluation, scoped to a specific context level (session or trace) through its span context. </p>"
+    },
+    "EvaluationReferenceInputs":{
+      "type":"list",
+      "member":{"shape":"EvaluationReferenceInput"},
+      "max":1000,
+      "min":1,
+      "sensitive":true
+    },
     "EvaluationResultContent":{
       "type":"structure",
       "required":[
@@ -1767,6 +1830,10 @@
         "errorCode":{
           "shape":"EvaluationErrorCode",
           "documentation":"<p> The error code indicating the type of failure that occurred during evaluation. Used to programmatically identify and handle different categories of evaluation errors. </p>"
+        },
+        "ignoredReferenceInputFields":{
+          "shape":"IgnoredReferenceInputFields",
+          "documentation":"<p> The list of reference input field names that were provided but not used by the evaluator. Helps identify which ground truth data was not consumed during evaluation. </p>"
         }
       },
       "documentation":"<p> The comprehensive result of an evaluation containing the score, explanation, evaluator metadata, and execution details. Provides both quantitative ratings and qualitative insights about agent performance. </p>"
@@ -1790,6 +1857,17 @@
       "documentation":"<p> The specification of which trace or span IDs to evaluate within the provided input data. Allows precise targeting of evaluation at different levels: tool calls, traces, or sessions. </p>",
       "union":true
     },
+    "EvaluationToolName":{
+      "type":"string",
+      "max":500,
+      "min":1
+    },
+    "EvaluationToolNames":{
+      "type":"list",
+      "member":{"shape":"EvaluationToolName"},
+      "max":1000,
+      "min":0
+    },
     "EvaluatorArn":{
       "type":"string",
       "pattern":"arn:aws:bedrock-agentcore:[a-z0-9-]+:[0-9]{12}:evaluator\\/[a-zA-Z][a-zA-Z0-9-_]{0,99}-[a-zA-Z0-9]{10}$|^arn:aws:bedrock-agentcore:::evaluator/Builtin.[a-zA-Z0-9_-]+"
@@ -2469,6 +2547,17 @@
       "type":"integer",
       "box":true
     },
+    "IgnoredReferenceInputField":{
+      "type":"string",
+      "max":1000,
+      "min":1
+    },
+    "IgnoredReferenceInputFields":{
+      "type":"list",
+      "member":{"shape":"IgnoredReferenceInputField"},
+      "max":100,
+      "min":0
+    },
     "InputContentBlock":{
       "type":"structure",
       "required":["path"],
@@ -2540,7 +2629,7 @@
         },
         "runtimeSessionId":{
           "shape":"SessionType",
-          "documentation":"<p>Runtime session identifier</p>",
+          "documentation":"<p>The unique identifier of the runtime session in which to execute the command. This session ID is used to maintain state and context across multiple command invocations.</p>",
           "idempotencyToken":true,
           "location":"header",
           "locationName":"X-Amzn-Bedrock-AgentCore-Runtime-Session-Id"
@@ -2571,28 +2660,28 @@
         },
         "agentRuntimeArn":{
           "shape":"String",
-          "documentation":"<p>ARN of the agent runtime</p>",
+          "documentation":"<p>The Amazon Resource Name (ARN) of the agent runtime on which to execute the command. This identifies the specific agent runtime environment where the command will run.</p>",
           "location":"uri",
           "locationName":"agentRuntimeArn"
         },
         "qualifier":{
           "shape":"String",
-          "documentation":"<p>Version or alias qualifier</p>",
+          "documentation":"<p>The qualifier to use for the agent runtime. This is an endpoint name that points to a specific version. If not specified, Amazon Bedrock AgentCore uses the default endpoint of the agent runtime.</p>",
           "location":"querystring",
           "locationName":"qualifier"
         },
         "accountId":{
           "shape":"InvokeAgentRuntimeCommandRequestAccountIdString",
-          "documentation":"<p>Account ID (12 digits)</p>",
+          "documentation":"<p>The identifier of the Amazon Web Services account for the agent runtime resource. This parameter is required when you specify an agent ID instead of the full ARN for <code>agentRuntimeArn</code>.</p>",
           "location":"querystring",
           "locationName":"accountId"
         },
         "body":{
           "shape":"InvokeAgentRuntimeCommandRequestBody",
-          "documentation":"<p>Request body containing command and timeout</p>"
+          "documentation":"<p>The request body containing the command to execute and optional configuration parameters such as timeout settings.</p>"
         }
       },
-      "documentation":"<p>Request for InvokeAgentRuntimeCommand operation</p>",
+      "documentation":"<p>Request for the <code>InvokeAgentRuntimeCommand</code> operation.</p>",
       "payload":"body"
     },
     "InvokeAgentRuntimeCommandRequestAccountIdString":{
@@ -2610,14 +2699,14 @@
       "members":{
         "command":{
           "shape":"InvokeAgentRuntimeCommandRequestBodyCommandString",
-          "documentation":"<p>The command to execute in the runtime container</p>"
+          "documentation":"<p>The shell command to execute on the agent runtime. This command is executed in the runtime environment and its output is streamed back to the caller.</p>"
         },
         "timeout":{
           "shape":"Integer",
-          "documentation":"<p>Command timeout in seconds (default: 300, min:1, max: 3600)</p>"
+          "documentation":"<p>The maximum duration in seconds to wait for the command to complete. If the command execution exceeds this timeout, it will be terminated. Default is 300 seconds. Minimum is 1 second. Maximum is 3600 seconds.</p>"
         }
       },
-      "documentation":"<p>Request body for InvokeAgentRuntimeCommand</p>"
+      "documentation":"<p>The request body structure for the <code>InvokeAgentRuntimeCommand</code> operation, containing the command to execute and optional configuration parameters.</p>"
     },
     "InvokeAgentRuntimeCommandRequestBodyCommandString":{
       "type":"string",
@@ -2648,7 +2737,7 @@
       "members":{
         "runtimeSessionId":{
           "shape":"SessionId",
-          "documentation":"<p>Runtime session identifier</p>",
+          "documentation":"<p>The unique identifier of the runtime session in which the command was executed.</p>",
           "location":"header",
           "locationName":"X-Amzn-Bedrock-AgentCore-Runtime-Session-Id"
         },
@@ -2689,22 +2778,22 @@
         },
         "stream":{
           "shape":"InvokeAgentRuntimeCommandStreamOutput",
-          "documentation":"<p>Streaming output containing command execution events</p>"
+          "documentation":"<p>The streaming output from the command execution. This stream contains events that provide real-time updates including standard output, standard error, and completion status.</p>"
         }
       },
-      "documentation":"<p>Response for InvokeAgentRuntimeCommand operation</p>",
+      "documentation":"<p>Response for the <code>InvokeAgentRuntimeCommand</code> operation.</p>",
       "payload":"stream"
     },
     "InvokeAgentRuntimeCommandStreamOutput":{
       "type":"structure",
       "members":{
         "chunk":{
           "shape":"ResponseChunk",
-          "documentation":"<p>Response chunk containing command execution events</p>"
+          "documentation":"<p>A response chunk containing command execution events such as content start, content delta, or content stop events.</p>"
         },
         "accessDeniedException":{
           "shape":"AccessDeniedException",
-          "documentation":"<p>Exception events for error streaming</p>"
+          "documentation":"<p>Exception events for error streaming.</p>"
         },
         "internalServerException":{"shape":"InternalServerException"},
         "resourceNotFoundException":{"shape":"ResourceNotFoundException"},
@@ -2713,7 +2802,7 @@
         "validationException":{"shape":"ValidationException"},
         "runtimeClientError":{"shape":"RuntimeClientError"}
       },
-      "documentation":"<p>Streaming output for InvokeAgentRuntimeCommand operation Delivers typed events: contentStart (first), contentDelta (middle), contentStop (last)</p>",
+      "documentation":"<p>The streaming output union for the <code>InvokeAgentRuntimeCommand</code> operation. This union delivers typed events: <code>contentStart</code> (first), <code>contentDelta</code> (middle), and <code>contentStop</code> (last).</p>",
       "eventstream":true
     },
     "InvokeAgentRuntimeRequest":{
@@ -3841,18 +3930,18 @@
       "members":{
         "contentStart":{
           "shape":"ContentStartEvent",
-          "documentation":"<p>First chunk - indicates command execution has started</p>"
+          "documentation":"<p>An event indicating the start of content streaming from the command execution. This is the first chunk received.</p>"
         },
         "contentDelta":{
           "shape":"ContentDeltaEvent",
-          "documentation":"<p>Middle chunks - stdout/stderr output</p>"
+          "documentation":"<p>An event containing incremental output (stdout or stderr) from the command execution. These are the middle chunks.</p>"
         },
         "contentStop":{
           "shape":"ContentStopEvent",
-          "documentation":"<p>Last chunk - indicates command execution has completed</p>"
+          "documentation":"<p>An event indicating the completion of the command execution, including the exit code and final status. This is the last chunk received.</p>"
         }
       },
-      "documentation":"<p>Response chunk containing exactly one of: contentStart, contentDelta, or contentStop</p>",
+      "documentation":"<p>A structure representing a response chunk that contains exactly one of the possible event types: <code>contentStart</code>, <code>contentDelta</code>, or <code>contentStop</code>.</p>",
       "event":true
     },
     "ResponseStream":{