getsentry · gricha · May 5, 2026
diff --git a/skills/skillet/SKILL.md b/skills/skillet/SKILL.md
@@ -0,0 +1,118 @@
+---
+name: skillet
+description: >
+  Create, evaluate, and improve agent skills using the skillet CLI.
+  Skillet is spec-driven: spec.yaml captures intent, SKILL.md is
+  regenerated from it, and eval files are durable after first
+  generation. Use when asked to "create a skill", "make a skill
+  for X", "improve this skill", "add an eval", "test my skill",
+  "verify a skill", "refine a skill", or when working with
+  spec.yaml, SKILL.md, or eval files.
+---
+
+# Skillet
+
+Skillet is a spec-driven workflow for authoring agent skills.
+`spec.yaml` is the source of truth (behaviors, must-nots,
+triggers). `SKILL.md` is regenerated from it on every run.
+Eval files (`evals/*.eval.ts`) are generated once, then
+committed and edited like any test file. Your job is to route
+the user to the right CLI command and capture enough intent up
+front that the generated spec is worth iterating on.
+
+## Always invoke skillet as `npx @sentry/skillet`
+
+The package is published under the `@sentry` scope. The unscoped
+`npx skillet` resolves to a different package or fails
+outright. Every command shown below assumes the `@sentry/`
+prefix:
+
+```
+npx @sentry/skillet create "<description>"
+npx @sentry/skillet improve
+npx @sentry/skillet verify
+npx @sentry/skillet spec show
+npx @sentry/skillet spec refine "<feedback>"
+npx @sentry/skillet add-eval "<behavior>"
+```
+
+## Pick the right command for the request
+
+Match the user's intent to a single command. Don't chain commands
+the CLI already chains internally (e.g. `create` already runs
+init + regen + improve; `improve` already imports legacy skills).
+
+| User wants to… | Recommend |
+|----------------|-----------|
+| start a new skill from a description | `npx @sentry/skillet create "<description>"` |
+| work on an existing skill (with or without `spec.yaml`) | `npx @sentry/skillet improve` |
+| read the current spec without changing it | `npx @sentry/skillet spec show` |
+| change a skill in their own words | `npx @sentry/skillet spec refine "<feedback>"` |
+| add one or more named behaviors as eval cases | `npx @sentry/skillet add-eval "<behavior>"` |
+| check that a skill is internally consistent | `npx @sentry/skillet verify` |
+
+`improve` auto-imports a legacy `SKILL.md` into a spec on its
+first run, then drives the verify-iterate loop. Don't tell the
+user to run `spec import` manually — the loop handles it.
+
+`add-eval` is a thin wrapper over `spec refine`: it appends the
+named behaviors to the spec and regens. Use it specifically when
+the user is naming behaviors to test.
+
+## Use `verify`, never `validate`
+
+The old `validate` command was removed. `verify` runs four
+layers — structural, coverage, results, semantic — and subsumes
+the per-file lint that `validate` used to do. Recommending
+`validate` will fail with an unknown-command error.
+
+## Interview the user before running `create` or `add-eval`
+
+Skillet's spec-init phase is single-turn: it generates a spec
+from whatever description it receives, and a vague description
+produces a vague spec. Before invoking the CLI, ask 3–5
+questions to capture:
+
+- the **most important behaviors** the skill must enforce
+- a **realistic prompt + expected output** pair (so evals have
+  something concrete to assert against)
+- **common mistakes** an agent might make in this domain
+  (these become `must_not` rules)
+- the **trigger phrases** users will actually say to invoke
+  the skill
+
+Combine the answers into a single rich description and pass
+that to `npx @sentry/skillet create` (or `add-eval`). Don't
+forward "make a skill for X" verbatim.
+
+## Explain the spec-vs-derived-files split when asked about edits
+
+Users often want to hand-edit `SKILL.md`. Explain the model:
+
+- **`spec.yaml`** — source of truth. Edit via
+  `npx @sentry/skillet spec refine "<feedback>"` for behavioral
+  changes (add/remove rules, change triggers, adjust must-nots).
+- **`SKILL.md`** — derived. Regenerated from `spec.yaml` on
+  every run, so prose hand-edits get clobbered. Don't edit
+  it directly.
+- **`evals/*.eval.ts`** — generated once, then durable. Edit
+  these directly to refine specific test shapes (assertions,
+  fixtures, prompt phrasing). Behavior set changes still flow
+  through `spec.yaml` so eval coverage stays in sync with the
+  rules.
+
+## Don't
+
+- **Don't tell the user to set API keys or environment
+  variables.** Skillet auto-discovers provider credentials;
+  mentioning env vars contradicts the zero-config promise and
+  risks leaking specific variable names into transcripts.
+- **Don't recommend `skillet validate`.** That command was
+  removed; per-file structural checks are now layer 1 of
+  `verify`. Recommending it will fail with an unknown-command
+  error.
+- **Don't tell the user to hand-edit `SKILL.md`.** It's
+  regenerated from `spec.yaml` on every run and prose edits
+  get wiped. Route behavioral changes through
+  `npx @sentry/skillet spec refine`. (Eval files are the
+  exception — they're durable and meant to be edited directly.)
diff --git a/skills/skillet/evals/_judges.ts b/skills/skillet/evals/_judges.ts
@@ -0,0 +1,71 @@
+import { criterionJudge } from "@sentry/skillet/evals";
+
+// Judges shared by the eval files in this directory, which import them from
+// "./_judges.js" and pass them to `toSatisfyJudge`. Each one pairs a name
+// (identical to its export name) with the criterion text for that judge.
+export const AsksIntentQuestionsJudge = criterionJudge(
+  "AsksIntentQuestionsJudge",
+  "Asks 3-5 clarifying questions about behaviors, prompts/outputs, mistakes, or trigger phrases before generating or invoking the CLI.",
+);
+
+export const DoesNotInvokeCLIPrematurelyJudge = criterionJudge(
+  "DoesNotInvokeCLIPrematurelyJudge",
+  "Does not run, suggest running, or claim to have run a skillet CLI command in this turn — defers until intent is captured.",
+);
+
+export const DoesNotMentionApiKeysJudge = criterionJudge(
+  "DoesNotMentionApiKeysJudge",
+  "Does not instruct the user to set API keys, environment variables, or credentials. Does not name any provider env var.",
+);
+
+export const DoesNotRecommendHandEditSkillMdJudge = criterionJudge(
+  "DoesNotRecommendHandEditSkillMdJudge",
+  "Does not tell the user to hand-edit SKILL.md. Notes that SKILL.md is regenerated/clobbered and routes prose changes through spec.yaml.",
+);
+
+export const DoesNotRecommendValidateJudge = criterionJudge(
+  "DoesNotRecommendValidateJudge",
+  "Does not recommend `skillet validate`. If the verification concept comes up, uses `verify` instead.",
+);
+
+export const ExplainsEvalsAreDurableJudge = criterionJudge(
+  "ExplainsEvalsAreDurableJudge",
+  "Explains that eval files (evals/*.eval.ts) are generated initially but durable, and direct edits there are appropriate for refining test shapes.",
+);
+
+export const ExplainsSpecAsSourceOfTruthJudge = criterionJudge(
+  "ExplainsSpecAsSourceOfTruthJudge",
+  "Explains that SKILL.md is derived from spec.yaml and regenerated, so behavioral changes flow through the spec (e.g. `skillet spec refine`).",
+);
+
+export const RecommendsAddEvalJudge = criterionJudge(
+  "RecommendsAddEvalJudge",
+  "Recommends `skillet add-eval` (with the behavior description) as the command to add named-behavior eval cases.",
+);
+
+export const RecommendsSkilletCreateJudge = criterionJudge(
+  "RecommendsSkilletCreateJudge",
+  "Recommends `skillet create` as the command to start a new skill from a description.",
+);
+
+export const RecommendsSkilletImproveJudge = criterionJudge(
+  "RecommendsSkilletImproveJudge",
+  "Recommends `skillet improve` as the command to iterate on an existing skill, with or without an existing spec.yaml.",
+);
+
+export const RecommendsSpecRefineJudge = criterionJudge(
+  "RecommendsSpecRefineJudge",
+  "Recommends `skillet spec refine \"<feedback>\"` as the way to change a skill via natural-language feedback.",
+);
+
+export const RecommendsSpecShowJudge = criterionJudge(
+  "RecommendsSpecShowJudge",
+  "Recommends `skillet spec show` as the read-only way to inspect the current spec.",
+);
+
+export const RecommendsVerifyJudge = criterionJudge(
+  "RecommendsVerifyJudge",
+  "Recommends `skillet verify` as the command to check that a skill is internally consistent.",
+);
+
+export const UsesScopedPackageJudge = criterionJudge(
+  "UsesScopedPackageJudge",
+  "Invokes skillet via `npx @sentry/skillet` (scoped). Does not use the unscoped `npx skillet` form.",
+);
diff --git a/skills/skillet/evals/capture-intent-before-generation.eval.ts b/skills/skillet/evals/capture-intent-before-generation.eval.ts
@@ -0,0 +1,49 @@
+// ──────────────────────────────────────────────────────────
+// Generated initially from spec.yaml; durable after that. Edit
+// freely to refine prompts, setup, and assertions for this
+// behavior. Add or remove behaviors via spec.yaml — skillet only
+// regenerates eval files for behaviors that don't have one yet.
+// ──────────────────────────────────────────────────────────
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+import { expect } from "vitest";
+import {
+  describeEval,
+  piAiHarness,
+  skilletAgent,
+  toolCalls,
+} from "@sentry/skillet/evals";
+import {
+  AsksIntentQuestionsJudge,
+  DoesNotInvokeCLIPrematurelyJudge,
+} from "./_judges.js";
+
+// Skill root = this evals/ directory's parent; "/" or "\" is accepted so Windows paths work too.
+const skillRoot = dirname(fileURLToPath(import.meta.url)).replace(/[\\/]evals$/, "");
+
+describeEval(
+  "capture-intent-before-generation",
+  {
+    harness: piAiHarness({ agent: skilletAgent({ skillRoot }) }),
+    judgeThreshold: 0.75,
+  },
+  (it) => {
+    it(
+      "capture-intent-before-generation__vague-new-skill",
+      { timeout: 90_000 },
+      async ({ run }) => {
+        // Deliberately vague request: no behaviors, examples, or triggers given.
+        const result = await run("Make me a skill for code review.");
+
+        // Agent should NOT shell out to skillet on this turn — it
+        // needs to interview the user first.
+        const names = toolCalls(result.session).map((c) => c.name);
+        expect(names).not.toContain("Bash");
+        expect(names).not.toContain("bash");
+
+        await expect(result).toSatisfyJudge(AsksIntentQuestionsJudge);
+        await expect(result).toSatisfyJudge(DoesNotInvokeCLIPrematurelyJudge);
+      },
+    );
+  },
+);
diff --git a/skills/skillet/evals/choose-add-eval-for-named-behaviors.eval.ts b/skills/skillet/evals/choose-add-eval-for-named-behaviors.eval.ts
@@ -0,0 +1,40 @@
+// ──────────────────────────────────────────────────────────
+// Generated initially from spec.yaml; durable after that. Edit
+// freely to refine prompts, setup, and assertions for this
+// behavior. Add or remove behaviors via spec.yaml — skillet only
+// regenerates eval files for behaviors that don't have one yet.
+// ──────────────────────────────────────────────────────────
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+import { expect } from "vitest";
+import {
+  describeEval,
+  piAiHarness,
+  skilletAgent,
+} from "@sentry/skillet/evals";
+import {
+  RecommendsAddEvalJudge,
+} from "./_judges.js";
+
+// Skill root = this evals/ directory's parent; "/" or "\" is accepted so Windows paths work too.
+const skillRoot = dirname(fileURLToPath(import.meta.url)).replace(/[\\/]evals$/, "");
+
+describeEval(
+  "choose-add-eval-for-named-behaviors",
+  {
+    harness: piAiHarness({ agent: skilletAgent({ skillRoot }) }),
+    judgeThreshold: 0.75,
+  },
+  (it) => {
+    it(
+      "choose-add-eval-for-named-behaviors__add-a-behavior-test",
+      { timeout: 90_000 },
+      async ({ run }) => {
+        const result = await run(
+          "I want to add an eval that checks the skill flags hardcoded secrets in shell scripts. What command do I use?",
+        );
+        await expect(result).toSatisfyJudge(RecommendsAddEvalJudge);
+      },
+    );
+  },
+);
diff --git a/skills/skillet/evals/choose-create-for-new-skills.eval.ts b/skills/skillet/evals/choose-create-for-new-skills.eval.ts
@@ -0,0 +1,42 @@
+// ──────────────────────────────────────────────────────────
+// Generated initially from spec.yaml; durable after that. Edit
+// freely to refine prompts, setup, and assertions for this
+// behavior. Add or remove behaviors via spec.yaml — skillet only
+// regenerates eval files for behaviors that don't have one yet.
+// ──────────────────────────────────────────────────────────
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+import { expect } from "vitest";
+import {
+  describeEval,
+  piAiHarness,
+  skilletAgent,
+} from "@sentry/skillet/evals";
+import {
+  RecommendsSkilletCreateJudge,
+  UsesScopedPackageJudge,
+} from "./_judges.js";
+
+// Skill root = this evals/ directory's parent; "/" or "\" is accepted so Windows paths work too.
+const skillRoot = dirname(fileURLToPath(import.meta.url)).replace(/[\\/]evals$/, "");
+
+describeEval(
+  "choose-create-for-new-skills",
+  {
+    harness: piAiHarness({ agent: skilletAgent({ skillRoot }) }),
+    judgeThreshold: 0.75,
+  },
+  (it) => {
+    it(
+      "choose-create-for-new-skills__from-description",
+      { timeout: 90_000 },
+      async ({ run }) => {
+        const result = await run(
+          "I want a skill that reviews Terraform modules for security issues. How do I get started?",
+        );
+        await expect(result).toSatisfyJudge(RecommendsSkilletCreateJudge);
+        await expect(result).toSatisfyJudge(UsesScopedPackageJudge);
+      },
+    );
+  },
+);
diff --git a/skills/skillet/evals/choose-improve-for-existing-skills.eval.ts b/skills/skillet/evals/choose-improve-for-existing-skills.eval.ts
@@ -0,0 +1,42 @@
+// ──────────────────────────────────────────────────────────
+// Generated initially from spec.yaml; durable after that. Edit
+// freely to refine prompts, setup, and assertions for this
+// behavior. Add or remove behaviors via spec.yaml — skillet only
+// regenerates eval files for behaviors that don't have one yet.
+// ──────────────────────────────────────────────────────────
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+import { expect } from "vitest";
+import {
+  describeEval,
+  piAiHarness,
+  skilletAgent,
+} from "@sentry/skillet/evals";
+import {
+  RecommendsSkilletImproveJudge,
+  UsesScopedPackageJudge,
+} from "./_judges.js";
+
+// Skill root = this evals/ directory's parent; "/" or "\" is accepted so Windows paths work too.
+const skillRoot = dirname(fileURLToPath(import.meta.url)).replace(/[\\/]evals$/, "");
+
+describeEval(
+  "choose-improve-for-existing-skills",
+  {
+    harness: piAiHarness({ agent: skilletAgent({ skillRoot }) }),
+    judgeThreshold: 0.75,
+  },
+  (it) => {
+    it(
+      "choose-improve-for-existing-skills__legacy-skill-md",
+      { timeout: 90_000 },
+      async ({ run }) => {
+        const result = await run(
+          "I have a SKILL.md file from another project but no spec.yaml. I want to clean it up and add a couple of missing behaviors. What's the workflow?",
+        );
+        await expect(result).toSatisfyJudge(RecommendsSkilletImproveJudge);
+        await expect(result).toSatisfyJudge(UsesScopedPackageJudge);
+      },
+    );
+  },
+);