feat: rename run evals command to run eval (#636)

notgitika · web-flow · commit 408801af36ca · 2026-03-25T14:38:11.000-04:00
* feat: rename `run evals` command to `run eval`

* chore: fix formatting in README.md
diff --git a/AGENTS.md b/AGENTS.md
@@ -30,7 +30,7 @@ Note: CDK L3 constructs are in a separate package `@aws/agentcore-cdk`.
 - `status` - Check deployment status
 - `dev` - Local development server (CodeZip: uvicorn with hot-reload; Container: Docker build + run with volume mount)
 - `invoke` - Invoke agents (local or deployed)
-- `run evals` - Run on-demand evaluation against agent sessions
+- `run eval` - Run on-demand evaluation against agent sessions
 - `evals history` - View past eval run results
 - `pause online-eval` - Pause (disable) a deployed online eval config
 - `resume online-eval` - Resume (enable) a paused online eval config
diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ agentcore invoke
 | -------------------- | --------------------------------------------- |
 | `add evaluator`      | Add a custom LLM-as-a-Judge evaluator         |
 | `add online-eval`    | Add continuous evaluation for live traffic    |
-| `run evals`          | Run on-demand evaluation against agent traces |
+| `run eval`           | Run on-demand evaluation against agent traces |
 | `evals history`      | View past eval run results                    |
 | `pause online-eval`  | Pause a deployed online eval config           |
 | `resume online-eval` | Resume a paused online eval config            |
diff --git a/docs/commands.md b/docs/commands.md
@@ -586,16 +586,16 @@ agentcore traces get abc123 --agent MyAgent --output ./trace.json
 
 See [Evaluations](evals.md) for the full guide on evaluators, scoring, and online monitoring.
 
-### run evals
+### run eval
 
 Run on-demand evaluation against historical agent traces.
 
 ```bash
 # Project mode
-agentcore run evals --agent MyAgent --evaluator ResponseQuality --days 7
+agentcore run eval --agent MyAgent --evaluator ResponseQuality --days 7
 
 # Standalone mode (no project required)
-agentcore run evals \
+agentcore run eval \
   --agent-arn arn:aws:...:runtime/abc123 \
   --evaluator-arn arn:aws:...:evaluator/eval123 \
   --region us-east-1
diff --git a/docs/evals.md b/docs/evals.md
@@ -149,25 +149,25 @@ Run evaluators against historical agent traces.
 
 ```bash
 # Project mode — evaluate a project agent
-agentcore run evals \
+agentcore run eval \
   --agent MyAgent \
   --evaluator ResponseQuality \
   --days 7
 
 # Standalone mode — evaluate any agent by ARN
-agentcore run evals \
+agentcore run eval \
   --agent-arn arn:aws:bedrock-agentcore:us-east-1:123456789012:runtime/abc123 \
   --evaluator-arn arn:aws:bedrock-agentcore:us-east-1:123456789012:evaluator/eval123 \
   --region us-east-1
 
 # Multiple evaluators
-agentcore run evals \
+agentcore run eval \
   --agent MyAgent \
   --evaluator ResponseQuality Builtin.Faithfulness \
   --days 14
 
 # Target specific session or trace
-agentcore run evals \
+agentcore run eval \
   --agent MyAgent \
   --evaluator ResponseQuality \
   --session-id abc123 \
@@ -359,7 +359,7 @@ AgentCore provides pre-built evaluators that can be used without creating custom
 by their `Builtin.*` ID in `--evaluator` flags or in online eval config `evaluators` arrays.
 
 ```bash
-agentcore run evals --agent MyAgent --evaluator Builtin.Faithfulness
+agentcore run eval --agent MyAgent --evaluator Builtin.Faithfulness
 ```
 
 ---
@@ -369,8 +369,8 @@ agentcore run evals --agent MyAgent --evaluator Builtin.Faithfulness
 ### CI/CD Quality Gate
 
 ```bash
-# Run evals and fail pipeline if score < threshold
-result=$(agentcore run evals --agent MyAgent --evaluator ResponseQuality --days 1 --json)
+# Run eval and fail pipeline if score < threshold
+result=$(agentcore run eval --agent MyAgent --evaluator ResponseQuality --days 1 --json)
 score=$(echo "$result" | jq '.run.results[0].aggregateScore')
 if (( $(echo "$score < 0.7" | bc -l) )); then
   echo "Quality gate failed: score $score < 0.7"
@@ -389,7 +389,7 @@ agentcore add evaluator \
   --instructions "Evaluate the agent response quality. Context: {context}"
 
 # 2. Run on-demand eval to verify
-agentcore run evals --agent MyAgent --evaluator ResponseQuality --days 7
+agentcore run eval --agent MyAgent --evaluator ResponseQuality --days 7
 
 # 3. Set up continuous monitoring
 agentcore add online-eval \
@@ -407,7 +407,7 @@ agentcore deploy
 Evaluate agents and use evaluators outside of a project directory using ARNs:
 
 ```bash
-agentcore run evals \
+agentcore run eval \
   --agent-arn arn:aws:bedrock-agentcore:us-east-1:123456789012:runtime/my-agent \
   --evaluator-arn arn:aws:bedrock-agentcore:us-east-1:123456789012:evaluator/my-eval \
   --region us-east-1 \
diff --git a/e2e-tests/evals-lifecycle.test.ts b/e2e-tests/evals-lifecycle.test.ts
@@ -137,7 +137,7 @@ describe.sequential('e2e: evaluations lifecycle', () => {
         async () => {
           const result = await run([
             'run',
-            'evals',
+            'eval',
             '--agent',
             agentName,
             '--evaluator',
@@ -146,7 +146,7 @@ describe.sequential('e2e: evaluations lifecycle', () => {
             '1',
             '--json',
           ]);
-          expect(result.exitCode, `Run evals failed (stdout: ${result.stdout}, stderr: ${result.stderr})`).toBe(0);
+          expect(result.exitCode, `Run eval failed (stdout: ${result.stdout}, stderr: ${result.stderr})`).toBe(0);
           const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
           expect(json).toHaveProperty('success', true);
           expect(json).toHaveProperty('run');
diff --git a/src/cli/commands/eval/command.tsx b/src/cli/commands/eval/command.tsx
@@ -39,7 +39,7 @@ export const registerEval = (program: Command) => {
 
         const runs = result.runs ?? [];
         if (runs.length === 0) {
-          console.log('No eval runs found. Run `agentcore run evals` to create one.');
+          console.log('No eval runs found. Run `agentcore run eval` to create one.');
           return;
         }
 
diff --git a/src/cli/commands/run/command.tsx b/src/cli/commands/run/command.tsx
@@ -36,7 +36,7 @@ export const registerRun = (program: Command) => {
   const runCmd = program.command('run').description(COMMAND_DESCRIPTIONS.run);
 
   runCmd
-    .command('evals')
+    .command('eval')
     .description('Run on-demand evaluation of agent traces. Use --agent-arn to evaluate agents outside the project.')
     .option('-a, --agent <name>', 'Agent name from project config')
     .option('--agent-arn <arn>', 'Agent runtime ARN — run outside a project directory')
diff --git a/src/cli/tui/screens/eval/EvalScreen.tsx b/src/cli/tui/screens/eval/EvalScreen.tsx
@@ -403,9 +403,9 @@ export function EvalScreen({ onExit }: EvalScreenProps) {
       {noRuns && (
         <Box flexDirection="column">
           <Text dimColor>No eval runs found.</Text>
-          <Text dimColor>Run `agentcore run evals` to evaluate a project agent,</Text>
+          <Text dimColor>Run `agentcore run eval` to evaluate a project agent,</Text>
           <Text dimColor>
-            or `agentcore run evals --agent-arn <Text bold>ARN</Text> --evaluator-arn <Text bold>ARN</Text>` for agents
+            or `agentcore run eval --agent-arn <Text bold>ARN</Text> --evaluator-arn <Text bold>ARN</Text>` for agents
             outside the project.
           </Text>
           {resultsDir && <Text dimColor>Results saved to: {resultsDir}</Text>}

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -39,7 +39,7 @@ export const registerEval = (program: Command) => {` |
| `39` | `39` |  |
| `40` | `40` | `const runs = result.runs ?? [];` |
| `41` | `41` | `if (runs.length === 0) {` |
| `42` |  | ``- console.log('No eval runs found. Run `agentcore run evals` to create one.');`` |
|  | `42` | ``+ console.log('No eval runs found. Run `agentcore run eval` to create one.');`` |
| `43` | `43` | `return;` |
| `44` | `44` | `}` |
| `45` | `45` |  |