microsoft
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
Lines changed: 2 additions & 7 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/.vally.yaml b/.vally.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/evals/README.md b/evals/README.md
Lines changed: 62 additions & 0 deletions
diff --git a/evals/azure-deploy/eval.yaml b/evals/azure-deploy/eval.yaml
Lines changed: 0 additions & 2 deletions
diff --git a/evals/azure-enterprise-infra-planner/eval.yaml b/evals/azure-enterprise-infra-planner/eval.yaml
Lines changed: 0 additions & 12 deletions
diff --git a/evals/azure-hosted-copilot-sdk/eval.yaml b/evals/azure-hosted-copilot-sdk/eval.yaml
Lines changed: 3 additions & 36 deletions
diff --git a/evals/azure-hosted-copilot-sdk/tasks/byom-config.yaml b/evals/azure-hosted-copilot-sdk/tasks/byom-config.yaml
Lines changed: 0 additions & 29 deletions
@@ -1,10 +1,5 @@
 name: Run Skill Evaluations
 on:
-  pull_request:
-    branches: [main]
-    paths:
-      - 'evals/**'
-      - 'plugin/skills/**'
   workflow_dispatch:
 
 permissions:
@@ -23,8 +18,8 @@ jobs:
           node-version: '22'
           registry-url: https://npm.pkg.github.com
           scope: '@microsoft'
-      - name: Install dependencies
-        run: npm install --no-save
+      - name: Install vally-cli
+        run: npm install --no-save @microsoft/vally-cli
         env:
           NODE_AUTH_TOKEN: ${{ secrets.VALLY_NPM_TOKEN }}
       - name: Run evaluations
 
@@ -347,3 +347,6 @@ x86/
 dashboard/.azure/
 dashboard/dist/
 dashboard/**/dist/
+
+# Local vally eval outputs
+results/
@@ -7,13 +7,13 @@ paths:
 
 suites:
   smoke:
-    description: "Fast static checks — no LLM calls, <60s"
+    description: "Static non-LLM checks only (e.g., trigger-pattern tests). Currently empty — all evals use the copilot-sdk executor."
     filter:
       tier: smoke
       cost: free
 
   pr:
-    description: "All free-tier evals for PR gate, <2min"
+    description: "Non-LLM PR gate evals (cost: free reserved for static checks). Currently empty — populate as static evals are added."
     filter:
       cost: free
 
@@ -23,7 +23,7 @@ suites:
       type: trigger
 
   integration:
-    description: "All behavior/integration evals"
+    description: "All behavior/integration evals (LLM-backed)"
     filter:
       type: integration
 
 
@@ -0,0 +1,62 @@
+# Evals
+
+Skill evaluation suites run by [Vally](https://github.com/microsoft/ai-bench) (`@microsoft/vally-cli`). Each subdirectory corresponds to a skill and contains an `eval.yaml` defining stimuli, graders, and configuration.
+
+## Prerequisites
+
+`@microsoft/vally-cli` is published to GitHub Packages. You need a GitHub **Personal Access Token** with the `read:packages` scope.
+
+1. Create a classic PAT at <https://github.com/settings/tokens> and enable the `read:packages` scope.
+2. Configure npm to use GitHub Packages for the `@microsoft` scope. Create or update `~/.npmrc`:
+
+   ```ini
+   @microsoft:registry=https://npm.pkg.github.com
+   //npm.pkg.github.com/:_authToken=${GITHUB_PACKAGES_TOKEN}
+   ```
+
+3. Export your token:
+
+   ```bash
+   export GITHUB_PACKAGES_TOKEN=ghp_xxxxxxxxxxxx
+   ```
+
+4. Install the CLI (either globally, or invoke with `npx`):
+
+   ```bash
+   npm install -g @microsoft/vally-cli
+   # or, no install: use `npx @microsoft/vally-cli ...` below
+   ```
+
+To run evals you also need a Copilot-enabled `GITHUB_TOKEN` in your environment; it is required by the `copilot-sdk` executor, which all current evals use.
+
+## Running a single eval spec
+
+From the repo root:
+
+```bash
+npx @microsoft/vally-cli eval \
+  --eval-spec evals/azure-hosted-copilot-sdk/eval.yaml \
+  --output-dir ./results \
+  --output jsonl
+```
+
+## Running a suite
+
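+Suites are defined in [`.vally.yaml`](../.vally.yaml) at the repo root; each suite filters by tag across all `evals/**/eval.yaml` files. Note that the `smoke` and `pr` suites are reserved for static, non-LLM checks and are currently empty, so they select no evals until such checks are added; use an LLM-backed suite such as `integration` to run the existing evals.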
+
+```bash
+npx @microsoft/vally-cli eval --suite pr
+npx @microsoft/vally-cli eval --suite full
+```
+
+## Viewing results
+
+After a run, check the output directory (default `./results`):
+
+- `results.jsonl` — one JSON record per stimulus/run with grader outcomes.
+- `eval-results.md` — human-readable summary.
+
+## More info
+
+- Vally docs & source: <https://github.com/microsoft/ai-bench>
+- Suite definitions: [`.vally.yaml`](../.vally.yaml)
@@ -42,7 +42,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: output
     graders:
       # Task: expected.output_contains
@@ -80,7 +79,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: output
     graders:
       # Task: expected.output_contains
 
@@ -42,7 +42,6 @@ stimuli:
     tags:
       type: integration
       tier: smoke
-      cost: free
       area: files
     constraints:
       max_turns: 50
@@ -78,7 +77,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: behavior
     constraints:
       max_turns: 50
@@ -110,7 +108,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: [output, files]
     constraints:
       max_turns: 50
@@ -175,7 +172,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: [output, files]
     constraints:
       max_turns: 50
@@ -239,7 +235,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: behavior
     constraints:
       max_turns: 50
@@ -270,7 +265,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: behavior
     constraints:
       max_turns: 50
@@ -302,7 +296,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: [output, files]
     constraints:
       max_turns: 50
@@ -367,7 +360,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: [output, files]
     constraints:
       max_turns: 50
@@ -432,7 +424,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: [output, files]
     constraints:
       max_turns: 50
@@ -497,7 +488,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: [output, files]
     constraints:
       max_turns: 50
@@ -561,7 +551,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: behavior
     constraints:
       max_turns: 50
@@ -593,7 +582,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: [output, files]
     constraints:
       max_turns: 50
 
@@ -19,6 +19,9 @@ tags:
 environment:
   skills:
     - ../../plugin/skills/azure-hosted-copilot-sdk
+    - ../../plugin/skills/azure-prepare
+    - ../../plugin/skills/azure-validate
+    - ../../plugin/skills/azure-deploy
 
 config:
   runs: 1
@@ -38,7 +41,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: output
     constraints:
       max_turns: 10
@@ -48,12 +50,6 @@ stimuli:
         config:
           substring: "azd init --template azure-samples/copilot-sdk-service"
       # Task: expected.output_not_contains
-      - type: output-not-contains
-        config:
-          substring: "error"
-      - type: output-not-contains
-        config:
-          substring: "failed"
       # Global: no_fatal_errors
       - type: output-not-matches
         config:
@@ -67,7 +63,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: output
     constraints:
       max_turns: 10
@@ -77,12 +72,6 @@ stimuli:
         config:
           substring: "azure.yaml"
       # Task: expected.output_not_contains
-      - type: output-not-contains
-        config:
-          substring: "error"
-      - type: output-not-contains
-        config:
-          substring: "failed"
       # Global: no_fatal_errors
       - type: output-not-matches
         config:
@@ -96,7 +85,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: output
     constraints:
       max_turns: 15
@@ -112,12 +100,6 @@ stimuli:
         config:
           substring: "azure-deploy"
       # Task: expected.output_not_contains
-      - type: output-not-contains
-        config:
-          substring: "error"
-      - type: output-not-contains
-        config:
-          substring: "failed"
       # Global: no_fatal_errors
       - type: output-not-matches
         config:
@@ -131,7 +113,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: output
     constraints:
       max_turns: 10
@@ -150,12 +131,6 @@ stimuli:
       - type: output-not-contains
         config:
           substring: "DefaultAzureCredential"
-      - type: output-not-contains
-        config:
-          substring: "error"
-      - type: output-not-contains
-        config:
-          substring: "failed"
       # Global: no_fatal_errors
       - type: output-not-matches
         config:
@@ -169,7 +144,6 @@ stimuli:
     tags:
       type: integration
       tier: full
-      cost: free
       area: output
     constraints:
       max_turns: 10
@@ -185,12 +159,6 @@ stimuli:
         config:
           substring: "DefaultAzureCredential"
       # Task: expected.output_not_contains
-      - type: output-not-contains
-        config:
-          substring: "error"
-      - type: output-not-contains
-        config:
-          substring: "failed"
       - type: output-not-contains
         config:
           substring: "apiKey"
@@ -213,7 +181,6 @@ stimuli:
     tags:
       type: trigger
       tier: smoke
-      cost: free
       area: routing
     constraints:
       max_turns: 10