Rename eval suites to align with agentcontrol skill renames (#68)

ari-launchdarkly · web-flow · commit 0cffd4a5d6d8 · 2026-05-22T08:54:57.000-04:00
diff --git a/evals/README.md b/evals/README.md
@@ -28,17 +28,17 @@ Run these after any changes to the provider, mock, or shared utilities to catch
 # From evals/
 
 # Run a single suite (all test cases)
-npm run eval:aiconfig-create         # ai-configs/aiconfig-create
-npm run eval:aiconfig-update         # ai-configs/aiconfig-update
-npm run eval:aiconfig-tools          # ai-configs/aiconfig-tools
-npm run eval:aiconfig-variations     # ai-configs/aiconfig-variations
+npm run eval:configs-create          # agentcontrol/configs-create
+npm run eval:configs-update          # agentcontrol/configs-update
+npm run eval:agentcontrol-tools      # agentcontrol/tools
+npm run eval:configs-variations      # agentcontrol/configs-variations
 npm run eval:flag-create             # feature-flags/launchdarkly-flag-create
 
 # Quick smoke check — first test case only (~15-20s, ~$0.05)
-npm run eval:aiconfig-create:single
-npm run eval:aiconfig-update:single
-npm run eval:aiconfig-tools:single
-npm run eval:aiconfig-variations:single
+npm run eval:configs-create:single
+npm run eval:configs-update:single
+npm run eval:agentcontrol-tools:single
+npm run eval:configs-variations:single
 npm run eval:flag-create:single
 
 # Aggregate and CI operations
@@ -147,7 +147,7 @@ This handles agents that call `get-foo` before AND after mutation; using `indexO
 
 ### Cross-model evaluation (`run-models.js`)
 
-The cross-model runner evaluates all suites against one or more model aliases without touching the canonical `eval-scores.json`. Results are written to `<suite>/results.<alias>.json` (e.g., `aiconfig-create/results.haiku.json`).
+The cross-model runner evaluates all suites against one or more model aliases without touching the canonical `eval-scores.json`. Results are written to `<suite>/results.<alias>.json` (e.g., `configs-create/results.haiku.json`).
 
 ```bash
 npm run eval:haiku                   # claude-haiku-4-5-20251001
@@ -222,7 +222,7 @@ Read the SKILL.md and note every MCP tool it references. Verify each tool exists
 mkdir <skill-name>
 ```
 
-Use the same name as the skill directory (e.g., `aiconfig-create`). Create `promptfooconfig.yaml`:
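+Use the same name as the skill directory (e.g., `configs-create`); one existing suite adds the domain as a prefix instead (`agentcontrol-tools` for `skills/agentcontrol/tools`). Create `promptfooconfig.yaml`: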
 
 ```yaml
 # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
@@ -264,7 +264,7 @@ Add an entry to `scripts/_manifest.js`:
 ```js
 {
   suite: "<skill-name>",
-  skillKey: "<domain>/<skill-name>",   // e.g. "ai-configs/aiconfig-create"
+  skillKey: "<domain>/<skill-name>",   // e.g. "agentcontrol/configs-create"
   skillDir: "skills/<domain>/<skill-name>",
   readme: "skills/<domain>/<skill-name>/README.md",
 },
@@ -364,7 +364,7 @@ Running `npm run eval:all` writes a summary at the repo root:
   "updatedAt": "2026-05-19T00:00:00Z",
   "lastCommit": "fc69376",
   "skills": {
-    "ai-configs/aiconfig-create": {
+    "agentcontrol/configs-create": {
       "score": 100,
       "passed": 4,
       "total": 4,
@@ -377,6 +377,6 @@ Running `npm run eval:all` writes a summary at the repo root:
 ```
 
 - `lastCommit` — the short git SHA at the time of the last `eval:all` run. Used by `eval:diff` to determine which suites have changed since scores were recorded.
-- `skillKey` — the canonical key is `<domain>/<skill-name>` (e.g., `ai-configs/aiconfig-create`).
+- `skillKey` — the canonical key is `<domain>/<skill-name>` (e.g., `agentcontrol/configs-create`).
 
 Run `node scripts/aggregate.js` (without `--run`) to rebuild this file from existing `<suite>/results.json` files without making any API calls.
diff --git a/evals/agentcontrol-tools/promptfooconfig.yaml b/evals/agentcontrol-tools/promptfooconfig.yaml
@@ -1,13 +1,13 @@
 # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
 #
 # Run with shared defaults:
-#   promptfoo eval -c shared/defaults.yaml -c aiconfig-tools/promptfooconfig.yaml
+#   promptfoo eval -c shared/defaults.yaml -c agentcontrol-tools/promptfooconfig.yaml
 #
-# The aiconfig-tools skill covers creating agent tool definitions and attaching
+# The agentcontrol-tools skill covers creating agent tool definitions and attaching
 # them to config variations. Key invariant: tools must be created with
 # raw JSON Schema format (not OpenAI function-calling wrapper), and must be
 # created before being attached.
-description: "End-to-end evaluation of the aiconfig-tools skill"
+description: "End-to-end evaluation of the agentcontrol-tools skill"
 
 prompts:
   - file://../../skills/agentcontrol/tools/SKILL.md
@@ -67,7 +67,7 @@ tests:
       - type: llm-rubric
         threshold: 0.75
         value: |
-          Evaluate the aiconfig-tools workflow:
+          Evaluate the agentcontrol-tools workflow:
           1. Did it create the tool first with create-ai-tool?
           2. Did the tool schema use raw JSON Schema format (type: object, properties)?
           3. Did the schema include both requested parameters (query, limit)?
diff --git a/evals/configs-create/promptfooconfig.yaml b/evals/configs-create/promptfooconfig.yaml
@@ -1,13 +1,13 @@
 # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
 #
 # Run with shared defaults:
-#   promptfoo eval -c shared/defaults.yaml -c aiconfig-create/promptfooconfig.yaml
+#   promptfoo eval -c shared/defaults.yaml -c configs-create/promptfooconfig.yaml
 #
-# The aiconfig-create skill guides the agent through choosing agent vs
+# The configs-create skill guides the agent through choosing agent vs
 # completion mode, creating a config + variation, and verifying the setup.
 # The recommended path is setup-ai-config (one-step); the agent may also
 # use create-ai-config + create-ai-config-variation for more control.
-description: "End-to-end evaluation of the aiconfig-create skill"
+description: "End-to-end evaluation of the configs-create skill"
 
 prompts:
   - file://../../skills/agentcontrol/configs-create/SKILL.md
@@ -179,9 +179,9 @@ tests:
   # ------------------------------------------------------------------
   # Test 4: Targeting reminder
   # After creation, the skill should remind the user they need to run
-  # targeting (/aiconfig-targeting) before the SDK returns enabled=True.
+  # targeting (/configs-targeting) before the SDK returns enabled=True.
   # ------------------------------------------------------------------
-  - description: "Post-creation: response mentions targeting step or aiconfig-targeting"
+  - description: "Post-creation: response mentions targeting step or configs-targeting"
     vars:
       user_request: >
         Create a config called "email-assistant" for composing emails.
@@ -202,7 +202,7 @@ tests:
           after creation. Score 1.0 if the response:
           1. Confirms the config was created successfully.
           2. Mentions that targeting must be configured before the SDK returns enabled=True,
-             OR references the /aiconfig-targeting skill, OR notes the fallthrough points at
+             OR references the /configs-targeting skill, OR notes the fallthrough points at
              a disabled variation.
           Score 0.5 if it mentions the config was created but omits the targeting warning.
           Score 0.0 if it neither confirms creation nor mentions targeting.
diff --git a/evals/configs-update/promptfooconfig.yaml b/evals/configs-update/promptfooconfig.yaml
@@ -1,11 +1,11 @@
 # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
 #
 # Run with shared defaults:
-#   promptfoo eval -c shared/defaults.yaml -c aiconfig-update/promptfooconfig.yaml
+#   promptfoo eval -c shared/defaults.yaml -c configs-update/promptfooconfig.yaml
 #
-# The aiconfig-update skill covers updating variation model/prompts/parameters,
+# The configs-update skill covers updating variation model/prompts/parameters,
 # updating config metadata, archiving instead of deleting, and verification.
-description: "End-to-end evaluation of the aiconfig-update skill"
+description: "End-to-end evaluation of the configs-update skill"
 
 prompts:
   - file://../../skills/agentcontrol/configs-update/SKILL.md
@@ -62,7 +62,7 @@ tests:
       - type: llm-rubric
         threshold: 0.7
         value: |
-          Evaluate the aiconfig-update workflow:
+          Evaluate the configs-update workflow:
           1. Did it explore current state (health or get-ai-config) before mutating?
           2. Did it use update-ai-config-variation to change the model?
           3. Did it use correct Provider.model-id format for modelConfigKey (e.g. OpenAI.gpt-4o-mini)?
diff --git a/evals/configs-variations/promptfooconfig.yaml b/evals/configs-variations/promptfooconfig.yaml
@@ -1,12 +1,12 @@
 # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
 #
 # Run with shared defaults:
-#   promptfoo eval -c shared/defaults.yaml -c aiconfig-variations/promptfooconfig.yaml
+#   promptfoo eval -c shared/defaults.yaml -c configs-variations/promptfooconfig.yaml
 #
-# The aiconfig-variations skill covers cloning a variation to test one change
+# The configs-variations skill covers cloning a variation to test one change
 # at a time (the primary path), creating from scratch (when explicitly asked),
 # and safety rules around not deleting the baseline variation.
-description: "End-to-end evaluation of the aiconfig-variations skill"
+description: "End-to-end evaluation of the configs-variations skill"
 
 prompts:
   - file://../../skills/agentcontrol/configs-variations/SKILL.md
diff --git a/evals/package.json b/evals/package.json
@@ -3,14 +3,14 @@
   "private": true,
   "type": "commonjs",
   "scripts": {
-    "eval:aiconfig-create": "promptfoo eval -c shared/defaults.yaml -c aiconfig-create/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-create/results.json",
-    "eval:aiconfig-create:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-create/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
-    "eval:aiconfig-update": "promptfoo eval -c shared/defaults.yaml -c aiconfig-update/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-update/results.json",
-    "eval:aiconfig-update:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-update/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
-    "eval:aiconfig-tools": "promptfoo eval -c shared/defaults.yaml -c aiconfig-tools/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-tools/results.json",
-    "eval:aiconfig-tools:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-tools/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
-    "eval:aiconfig-variations": "promptfoo eval -c shared/defaults.yaml -c aiconfig-variations/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-variations/results.json",
-    "eval:aiconfig-variations:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-variations/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
+    "eval:configs-create": "promptfoo eval -c shared/defaults.yaml -c configs-create/promptfooconfig.yaml --env-file .env --no-cache -o configs-create/results.json",
+    "eval:configs-create:single": "promptfoo eval -c shared/defaults.yaml -c configs-create/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
+    "eval:configs-update": "promptfoo eval -c shared/defaults.yaml -c configs-update/promptfooconfig.yaml --env-file .env --no-cache -o configs-update/results.json",
+    "eval:configs-update:single": "promptfoo eval -c shared/defaults.yaml -c configs-update/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
+    "eval:agentcontrol-tools": "promptfoo eval -c shared/defaults.yaml -c agentcontrol-tools/promptfooconfig.yaml --env-file .env --no-cache -o agentcontrol-tools/results.json",
+    "eval:agentcontrol-tools:single": "promptfoo eval -c shared/defaults.yaml -c agentcontrol-tools/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
+    "eval:configs-variations": "promptfoo eval -c shared/defaults.yaml -c configs-variations/promptfooconfig.yaml --env-file .env --no-cache -o configs-variations/results.json",
+    "eval:configs-variations:single": "promptfoo eval -c shared/defaults.yaml -c configs-variations/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
     "eval:flag-create": "promptfoo eval -c shared/defaults.yaml -c launchdarkly-flag-create/promptfooconfig.yaml --env-file .env --no-cache -o launchdarkly-flag-create/results.json",
     "eval:flag-create:single": "promptfoo eval -c shared/defaults.yaml -c launchdarkly-flag-create/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1",
     "eval:all": "node scripts/aggregate.js --run",
diff --git a/evals/scripts/_manifest.js b/evals/scripts/_manifest.js
@@ -14,25 +14,25 @@
  */
 const SUITES = [
   {
-    suite: "aiconfig-create",
+    suite: "configs-create",
     skillKey: "agentcontrol/configs-create",
     skillDir: "skills/agentcontrol/configs-create",
     readme: "skills/agentcontrol/configs-create/README.md",
   },
   {
-    suite: "aiconfig-update",
+    suite: "configs-update",
     skillKey: "agentcontrol/configs-update",
     skillDir: "skills/agentcontrol/configs-update",
     readme: "skills/agentcontrol/configs-update/README.md",
   },
   {
-    suite: "aiconfig-tools",
+    suite: "agentcontrol-tools",
     skillKey: "agentcontrol/tools",
     skillDir: "skills/agentcontrol/tools",
     readme: "skills/agentcontrol/tools/README.md",
   },
   {
-    suite: "aiconfig-variations",
+    suite: "configs-variations",
     skillKey: "agentcontrol/configs-variations",
     skillDir: "skills/agentcontrol/configs-variations",
     readme: "skills/agentcontrol/configs-variations/README.md",
diff --git a/evals/scripts/_models.js b/evals/scripts/_models.js
@@ -30,7 +30,7 @@ function resolveModel(input) {
 /**
  * Reverse-lookup a friendly alias for a model id, falling back to the model
  * id itself. Used to label per-model output files like
- * `aiconfig-create/results.haiku.json`.
+ * `configs-create/results.haiku.json`.
  */
 function aliasFor(modelId) {
   for (const [alias, id] of Object.entries(MODEL_ALIASES)) {
diff --git a/evals/scripts/aggregate.js b/evals/scripts/aggregate.js
@@ -6,7 +6,7 @@
  * Modes:
  *   node scripts/aggregate.js              # rebuild from existing results.json
  *   node scripts/aggregate.js --run        # run every suite then aggregate
- *   node scripts/aggregate.js --run --only=aiconfig-create,aiconfig-update
+ *   node scripts/aggregate.js --run --only=configs-create,configs-update
  *
  * Exits 0 on success, 1 on failure.
  */
diff --git a/evals/scripts/run-models.js b/evals/scripts/run-models.js
@@ -9,7 +9,7 @@
  *
  * Usage:
  *   node scripts/run-models.js --model=haiku
- *   node scripts/run-models.js --model=sonnet --only=aiconfig-create
+ *   node scripts/run-models.js --model=sonnet --only=configs-create
  *   node scripts/run-models.js --models=haiku,sonnet,opus
  *
  * Output:
@@ -78,7 +78,7 @@ function usage() {
     "Examples:",
     "  npm run eval:haiku",
     "  npm run eval:matrix",
-    "  node scripts/run-models.js --model=haiku --only=aiconfig-create",
+    "  node scripts/run-models.js --model=haiku --only=configs-create",
   ].join("\n");
 }
 

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`	`* Modes:`
`7`	`7`	`* node scripts/aggregate.js # rebuild from existing results.json`
`8`	`8`	`* node scripts/aggregate.js --run # run every suite then aggregate`
`9`		`- * node scripts/aggregate.js --run --only=aiconfig-create,aiconfig-update`
	`9`	`+ * node scripts/aggregate.js --run --only=configs-create,configs-update`
`10`	`10`	`*`
`11`	`11`	`* Exits 0 on success, 1 on failure.`
`12`	`12`	`*/`
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`	`*`
`10`	`10`	`* Usage:`
`11`	`11`	`* node scripts/run-models.js --model=haiku`
`12`		`- * node scripts/run-models.js --model=sonnet --only=aiconfig-create`
	`12`	`+ * node scripts/run-models.js --model=sonnet --only=configs-create`
`13`	`13`	`* node scripts/run-models.js --models=haiku,sonnet,opus`
`14`	`14`	`*`
`15`	`15`	`* Output:`
`@@ -78,7 +78,7 @@ function usage() {`
`78`	`78`	`"Examples:",`
`79`	`79`	`" npm run eval:haiku",`
`80`	`80`	`" npm run eval:matrix",`
`81`		`- " node scripts/run-models.js --model=haiku --only=aiconfig-create",`
	`81`	`+ " node scripts/run-models.js --model=haiku --only=configs-create",`
`82`	`82`	`].join("\n");`
`83`	`83`	`}`
`84`	`84`