diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json index cc38a65c..b118812d 100644 --- a/.agents/plugins/marketplace.json +++ b/.agents/plugins/marketplace.json @@ -430,6 +430,21 @@ "description": "Three-phase Requirements → Design → Tasks workflow for Claude Code and Codex — EARS notation acceptance criteria, autonomous execution loop, cross-spec dependencies, and post-implementation acceptance testing.", "icon": "./plugins/Habib0x0/spec-driven-plugin/assets/spec-driven-icon.svg" }, + { + "name": "staff-engineer-mode", + "displayName": "Staff Engineer Mode", + "source": { + "source": "local", + "path": "./plugins/sirmarkz/staff-engineer-mode" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_INSTALL" + }, + "category": "Development & Workflow", + "description": "Routes engineering design, delivery, reliability, security, operations, and maintenance prompts to focused staff-level specialist guidance for AI coding agents.", + "icon": "./plugins/sirmarkz/staff-engineer-mode/assets/icon.svg" + }, { "name": "stark", "displayName": "Stark", diff --git a/README.md b/README.md index 1c40299f..04d33c90 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,7 @@ Third-party plugins built by the community. [PRs welcome](#contributing)! - [Secret Guard](./plugins/mturac/secret-guard) - Pre-commit secret scanner using pattern and entropy detection. - [Session Orchestrator](https://github.com/Kanevry/session-orchestrator) - Session orchestration for Claude Code, Codex, and Cursor IDE — structured planning, wave-based execution, VCS integration (GitLab + GitHub), quality gates, and clean session close-out with issue tracking. - [Spec-Driven Development](https://github.com/Habib0x0/spec-driven-plugin) - Three-phase Requirements → Design → Tasks workflow for Claude Code and Codex — EARS notation acceptance criteria, autonomous execution loop, cross-spec dependencies, and post-implementation acceptance testing. 
+- [Staff Engineer Mode](https://github.com/sirmarkz/staff-engineer-mode) - Routes engineering design, delivery, reliability, security, operations, and maintenance prompts to focused staff-level specialist guidance for AI coding agents. - [Standup Generator](./plugins/mturac/standup-gen) - Daily standup notes from git activity across repos. - [Stark](https://github.com/f0d010c/stark) - UI/UX design plugin for AI coding agents with product-flow routing, platform-native interface guidance, asset planning, and shipped-reference analysis before code. - [tailtest](https://github.com/avansaber/tailtest-codex) - Hook-powered test generation -- detects files changed during an agent turn and instructs Codex to write and run tests automatically. Zero config, 8 languages. diff --git a/plugins.json b/plugins.json index b25f6715..31e28570 100644 --- a/plugins.json +++ b/plugins.json @@ -2,8 +2,8 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "name": "awesome-codex-plugins", "version": "1.0.0", - "last_updated": "2026-05-20", - "total": 82, + "last_updated": "2026-05-21", + "total": 83, "categories": [ "Development & Workflow", "Tools & Integrations" @@ -299,6 +299,16 @@ "source": "awesome-codex-plugins", "install_url": "https://raw.githubusercontent.com/Habib0x0/spec-driven-plugin/HEAD/.codex-plugin/plugin.json" }, + { + "name": "Staff Engineer Mode", + "url": "https://github.com/sirmarkz/staff-engineer-mode", + "owner": "sirmarkz", + "repo": "staff-engineer-mode", + "description": "Routes engineering design, delivery, reliability, security, operations, and maintenance prompts to focused staff-level specialist guidance for AI coding agents.", + "category": "Development & Workflow", + "source": "awesome-codex-plugins", + "install_url": "https://raw.githubusercontent.com/sirmarkz/staff-engineer-mode/HEAD/.codex-plugin/plugin.json" + }, { "name": "Stark", "url": "https://github.com/f0d010c/stark", diff --git 
a/plugins/sirmarkz/staff-engineer-mode/.codex-plugin/plugin.json b/plugins/sirmarkz/staff-engineer-mode/.codex-plugin/plugin.json new file mode 100644 index 00000000..830b3516 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/.codex-plugin/plugin.json @@ -0,0 +1,42 @@ +{ + "name": "staff-engineer-mode", + "version": "1.5.1", + "description": "Staff-engineering decision guidance for AI coding agents. Routes design, development, testing, release, operations, reliability, security, API, data, and platform work to specialist guidance.", + "author": { + "name": "sirmarkz", + "url": "https://github.com/sirmarkz/staff-engineer-mode" + }, + "homepage": "https://github.com/sirmarkz/staff-engineer-mode", + "repository": "https://github.com/sirmarkz/staff-engineer-mode", + "license": "MIT", + "keywords": [ + "staff-engineer", + "engineering-lifecycle", + "devops", + "sre", + "reliability", + "security", + "architecture", + "skills" + ], + "skills": "./skills/", + "interface": { + "displayName": "Staff Engineer Mode", + "shortDescription": "Staff Engineer Mode engineering lifecycle and operations guidance", + "longDescription": "Technology-agnostic guidance for applying large-scale engineering practices to architecture, reliability, resilience, delivery, operations, security, privacy, data, platform, client, and cost-aware reliability work. The router automatically selects the smallest useful routed specialist set from natural-language engineering requests.", + "developerName": "sirmarkz", + "category": "Coding", + "composerIcon": "./assets/icon.svg", + "capabilities": [ + "Interactive", + "Read", + "Write" + ], + "defaultPrompt": [ + "Design the production readiness checks for this service.", + "Make this architecture reliable and operable.", + "Decide the right engineering constraints for this change." 
+ ], + "brandColor": "#1F6FEB" + } +} diff --git a/plugins/sirmarkz/staff-engineer-mode/.codexignore b/plugins/sirmarkz/staff-engineer-mode/.codexignore new file mode 100644 index 00000000..ae44e78e --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/.codexignore @@ -0,0 +1,10 @@ +.git/ +.github/ +.claude/ +.codex/ +.cursor-plugin/ +.opencode/ +__pycache__/ +.pytest_cache/ +node_modules/ +assets/readme/ diff --git a/plugins/sirmarkz/staff-engineer-mode/LICENSE b/plugins/sirmarkz/staff-engineer-mode/LICENSE new file mode 100644 index 00000000..d49e8cd8 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/LICENSE @@ -0,0 +1,41 @@ +MIT License + +Copyright (c) 2026 sirmarkz + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Project Notice + +Staff Engineer Mode is an independent project. It cites public engineering +writing and standards for context. Named organizations are sources, not +sponsors, maintainers, or project owners. 
+ +The MIT License above covers this repository's original code, skill text, +documentation, scripts, manifests, and templates. + +Third-party articles, papers, documentation, trademarks, logos, product names, +and organization names remain under their owners' terms. + +For contributors: + +- Cite stable source IDs from `skills/_shared/references/source-index.md`. +- Write original summaries and operational guidance. +- Keep quotes short and necessary. +- Do not copy third-party articles, diagrams, logos, or other materials into + this repository. diff --git a/plugins/sirmarkz/staff-engineer-mode/README.md b/plugins/sirmarkz/staff-engineer-mode/README.md new file mode 100644 index 00000000..5322073a --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/README.md @@ -0,0 +1,208 @@ +# Staff Engineer Mode + +[![Release](https://img.shields.io/github/v/release/sirmarkz/staff-engineer-mode?label=release)](./RELEASE-NOTES.md) + +**Your AI coding agent ships fast. This makes it ship with judgment.** + +Staff-level engineering reasoning for design, development, testing, release, and operations decisions. + +## Why Now + +AI coding agents now write material amounts of production code. The bottleneck is no longer how fast the agent writes — it is whether the agent reasoned about what happens when the code runs at 3am. Agents will happily design an endpoint without a compatibility plan, implement a migration with no rollback, add tests that miss the failure mode, or ship a config change with no canary. This pack closes that gap. + +## How It Works + +Ask a normal engineering question. Hand the agent a task, design, diff, incident, rollout, or maintenance problem. The router reads the work, picks one specialist (occasionally one secondary), reads that specialist file, and returns concrete decisions, risks, checks, owners, supporting details, and next steps. You never name a specialist. + +Supported tools should list only the native `staff-engineer-mode` router. 
Specialist files live under `specialists/` and load only after routing. + +The router refuses to load every plausible specialist. One primary specialist at a time, by default. + +See [SAMPLE-PROMPTS.md](SAMPLE-PROMPTS.md) for prompts across every specialist. + +## What It Looks Like + +These are real Claude Code captures from public codebases. The prompt is plain +developer wording; Staff Engineer Mode routes to one specialist, loads that +specialist file, and returns a structured engineering answer. + +
+API compatibility review + +Claude Code capture showing an API compatibility review for an account API response change + +
+ +
+Backup and recovery review + +Claude Code capture showing a backup and recovery review for a database restore plan + +
+ +
+Code readability for agents review + +Claude Code capture showing a code readability for agents review for repo areas an AI coding agent may misunderstand + +
+ +
+Dependency and code hygiene plan + +Claude Code capture showing a dependency and code hygiene plan for stale dependencies and dead helper code + +
+ +
+Dependency resilience review + +Claude Code capture showing a dependency resilience review for remote media fetch timeouts, retries, and fallback behavior + +
+ +
+High availability design + +Claude Code capture showing a high availability design review for a multi-region control plane + +
+ +
+Observability and alerting review + +Claude Code capture showing an observability and alerting review for API server request errors and latency + +
+ +
+Performance and capacity review + +Claude Code capture showing a performance and capacity review for an account lookup endpoint + +
+ +
+Production readiness before launch + +Claude Code capture showing a production readiness review for a media processing queue split + +
+ +## Installation + +### Claude Code + +Register the marketplace: + +```text +/plugin marketplace add https://github.com/sirmarkz/staff-engineer-mode.git +``` + +Install the plugin: + +```text +/plugin install staff-engineer-mode@staff-engineer-mode +``` + +### Codex + +Works with Codex CLI and Codex App. Tell Codex: + +```text +Fetch and follow instructions from https://raw.githubusercontent.com/sirmarkz/staff-engineer-mode/main/.codex/INSTALL.md +``` + +### Cursor + +```text +/add-plugin staff-engineer-mode +``` + +### OpenCode + +Works with OpenCode. Tell OpenCode: + +```text +Fetch and follow instructions from https://raw.githubusercontent.com/sirmarkz/staff-engineer-mode/main/.opencode/INSTALL.md +``` + +### GitHub Copilot CLI + +Register the marketplace: + +```bash +copilot plugin marketplace add https://github.com/sirmarkz/staff-engineer-mode.git +``` + +Install the plugin: + +```bash +copilot plugin install staff-engineer-mode@staff-engineer-mode +``` + +### Gemini CLI + +```bash +gemini extensions install https://github.com/sirmarkz/staff-engineer-mode +``` + +## Verify + +Start a fresh session inside any open repo and ask one of: + +- "Before implementing partner webhooks, design the event contract, delivery retries, replay path, and dead-letter handling." +- "During development of the checkout inventory call, decide timeout, retry, fallback, and duplicate-work safeguards." +- "Review my last commit and tell me what you would catch in PR review." + +The agent should load the router, choose one specialist, and respond with concrete decisions, risks, checks, owners, supporting details, and next steps — not vibes. + +## What's Inside + +One native router skill: `staff-engineer-mode`. It routes to 54 specialist +files under `specialists/`; those files are not installed or listed as separate +native skills. 
+ +Examples by surface (the full catalog with prompts for every specialist file is in [SAMPLE-PROMPTS.md](SAMPLE-PROMPTS.md)): + +| Surface | Example specialist files | +| --- | --- | +| Architecture and interfaces | [`architecture-decisions`](specialists/architecture-decisions.md), [`api-design-and-compatibility`](specialists/api-design-and-compatibility.md), [`data-contracts`](specialists/data-contracts.md), [`state-machine-correctness`](specialists/state-machine-correctness.md) | +| Reliability and resilience | [`slo-and-error-budgets`](specialists/slo-and-error-budgets.md), [`high-availability-design`](specialists/high-availability-design.md), [`dependency-resilience`](specialists/dependency-resilience.md), [`backup-and-recovery`](specialists/backup-and-recovery.md), [`resilience-experiments`](specialists/resilience-experiments.md), [`performance-and-capacity`](specialists/performance-and-capacity.md) | +| Delivery and change safety | [`progressive-delivery`](specialists/progressive-delivery.md), [`feature-flag-lifecycle`](specialists/feature-flag-lifecycle.md), [`release-build-reproducibility`](specialists/release-build-reproducibility.md), [`testing-and-quality-gates`](specialists/testing-and-quality-gates.md), [`test-data-engineering`](specialists/test-data-engineering.md), [`dev-environment-parity`](specialists/dev-environment-parity.md), [`migration-and-deprecation`](specialists/migration-and-deprecation.md), [`code-readability-for-agents`](specialists/code-readability-for-agents.md), [`dependency-and-code-hygiene`](specialists/dependency-and-code-hygiene.md), [`configuration-and-automation-safety`](specialists/configuration-and-automation-safety.md), [`fleet-upgrades`](specialists/fleet-upgrades.md) | +| Operations and observability | [`observability-and-alerting`](specialists/observability-and-alerting.md) | +| Security and privacy | [`secure-sdlc-and-threat-modeling`](specialists/secure-sdlc-and-threat-modeling.md), 
[`identity-and-secrets`](specialists/identity-and-secrets.md), [`cryptography-and-key-lifecycle`](specialists/cryptography-and-key-lifecycle.md), [`software-supply-chain-security`](specialists/software-supply-chain-security.md), [`vulnerability-management`](specialists/vulnerability-management.md), [`tenant-isolation`](specialists/tenant-isolation.md), [`privacy-and-data-lifecycle`](specialists/privacy-and-data-lifecycle.md) | +| Data and workflow systems | [`distributed-data-and-consistency`](specialists/distributed-data-and-consistency.md), [`database-operations`](specialists/database-operations.md), [`event-workflows`](specialists/event-workflows.md), [`data-pipeline-reliability`](specialists/data-pipeline-reliability.md), [`caching-and-derived-data`](specialists/caching-and-derived-data.md) | +| Platform and edge | [`infrastructure-and-policy-as-code`](specialists/infrastructure-and-policy-as-code.md), [`internal-service-networking`](specialists/internal-service-networking.md), [`edge-traffic-and-ddos-defense`](specialists/edge-traffic-and-ddos-defense.md), [`cost-aware-reliability`](specialists/cost-aware-reliability.md) | +| Client, ML/AI, and experimentation | [`web-release-gates`](specialists/web-release-gates.md), [`mobile-release-engineering`](specialists/mobile-release-engineering.md), [`accessibility-gates`](specialists/accessibility-gates.md), [`llm-application-security`](specialists/llm-application-security.md), [`llm-evaluation`](specialists/llm-evaluation.md), [`llm-serving-cost-and-latency`](specialists/llm-serving-cost-and-latency.md), [`ml-reliability-and-evaluation`](specialists/ml-reliability-and-evaluation.md), [`experimentation-and-metric-guardrails`](specialists/experimentation-and-metric-guardrails.md) | +| Engineering workflow, readiness, and controls | [`agent-pr-review`](specialists/agent-pr-review.md), [`ai-coding-governance`](specialists/ai-coding-governance.md), [`documentation-lifecycle`](specialists/documentation-lifecycle.md), 
[`engineering-control-evidence`](specialists/engineering-control-evidence.md), [`production-readiness-review`](specialists/production-readiness-review.md), [`incident-response-and-postmortems`](specialists/incident-response-and-postmortems.md), [`oncall-health`](specialists/oncall-health.md), [`platform-golden-paths`](specialists/platform-golden-paths.md) | + +Every specialist file appears in [SAMPLE-PROMPTS.md](SAMPLE-PROMPTS.md) with four representative prompts. + +## Compared To Alternatives + +Staff Engineer Mode can be used alongside workflow skills like Superpowers. +Workflow skills shape how the agent plans and executes; Staff Engineer Mode +shapes the engineering checks and decisions it applies to the work. + +## Contributing + +Patches welcome — especially additional practices from authoritative sources: first-party engineering publications, official documentation, standards bodies, peer-reviewed papers, or widely cited practitioner references. + +New specialist files must be technology-agnostic, cite stable source IDs, and avoid vendor endorsement. Read [STYLE.md](STYLE.md) before opening a PR. The voice is enforced. + +## Maintainers + +See [MAINTAINERS.md](MAINTAINERS.md). + +## Sources And Influences + +This pack focuses on the intersection of the strongest publicly documented engineering practices from leading software engineering organizations. It synthesizes large-operator engineering writing (Google, Amazon, Meta, Microsoft, Apple, Netflix) and standards work cited by their teams (NIST, CISA, OWASP, OpenSSF, IETF, W3C). Specific source IDs are in `skills/_shared/references/source-index.md`. This is an independent project; nothing here is endorsed by or affiliated with those organizations. + +## License + +MIT — see [LICENSE](LICENSE). The project notice is included there. + +--- + +*Fewer vibes. 
More engineering.* diff --git a/plugins/sirmarkz/staff-engineer-mode/assets/icon.svg b/plugins/sirmarkz/staff-engineer-mode/assets/icon.svg new file mode 100644 index 00000000..c5bc9724 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/assets/icon.svg @@ -0,0 +1,13 @@ + + Staff Engineer Mode + A routing hub connects three engineering decision nodes. + + + + + + + + + + diff --git a/plugins/sirmarkz/staff-engineer-mode/package.json b/plugins/sirmarkz/staff-engineer-mode/package.json new file mode 100644 index 00000000..06326566 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/package.json @@ -0,0 +1,27 @@ +{ + "name": "staff-engineer-mode", + "version": "1.5.1", + "type": "module", + "main": ".opencode/plugins/staff-engineer-mode.js", + "description": "Staff-engineering decision guidance for AI coding agents. Routes design, development, testing, release, operations, reliability, security, API, data, and platform work to specialist guidance.", + "license": "MIT", + "repository": "https://github.com/sirmarkz/staff-engineer-mode", + "homepage": "https://github.com/sirmarkz/staff-engineer-mode", + "author": "sirmarkz", + "keywords": [ + "skills", + "staff-engineer", + "engineering-lifecycle", + "devops", + "sre", + "reliability", + "security", + "architecture", + "claude-code", + "codex", + "cursor", + "copilot-cli", + "gemini-cli", + "opencode" + ] +} diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/accessibility-release-check.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/accessibility-release-check.md new file mode 100644 index 00000000..97b31353 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/accessibility-release-check.md @@ -0,0 +1,13 @@ +# Accessibility Release Check + +## Target + +## Critical Journeys + +| Journey | Automated Check | Manual Check | Blocking Issues | Repair Path | Retest | +| --- | --- | --- | --- | --- | --- | + +## Exceptions + 
+| Issue | Severity | Compensating Path | Repair Path | Expiry | +| --- | --- | --- | --- | --- | diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/adr.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/adr.md new file mode 100644 index 00000000..0c384e78 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/adr.md @@ -0,0 +1,11 @@ +# Architecture Decision Record + +## Context + +## Decision + +## Status + +## Consequences + +## Evidence diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/ai-coding-instructions.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/ai-coding-instructions.md new file mode 100644 index 00000000..c54c7956 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/ai-coding-instructions.md @@ -0,0 +1,15 @@ +# AI Coding Instructions + +## Allowed Work + +## Protected Paths + +## Data Boundaries + +## Required Evidence + +## Review Rules + +## Dependency Rules + +## Exceptions diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/architecture-review.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/architecture-review.md new file mode 100644 index 00000000..7c7ce774 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/architecture-review.md @@ -0,0 +1,16 @@ +# Architecture Review + +## Context + +## Goals And Non-Goals + +## Risks + +| Risk | Impact | Mitigation | Responsibility Path | Evidence | +| --- | --- | --- | --- | --- | + +## Synthesized Default + +## Exceptions + +## Follow-Up Routes diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/cert-lifecycle-plan.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/cert-lifecycle-plan.md new file mode 100644 index 00000000..6e78c9f2 --- /dev/null +++ 
b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/cert-lifecycle-plan.md @@ -0,0 +1,14 @@ +# Certificate And Crypto Lifecycle Plan + +## Inventory + +| Item | Use | Responsibility Path | Consumers | Expires | Rotation Path | Monitoring | +| --- | --- | --- | --- | --- | --- | --- | + +## Compatibility + +## Renewal And Rotation + +## Emergency Revocation + +## Transition And Retirement diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/check-before-moving-on.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/check-before-moving-on.md new file mode 100644 index 00000000..1ea54c2f --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/check-before-moving-on.md @@ -0,0 +1,9 @@ +# Check Before Moving On + +## Check + +## Check Command Or Condition + +## Expected Result + +## Failure Response diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/configuration-safety-review.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/configuration-safety-review.md new file mode 100644 index 00000000..187d3443 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/configuration-safety-review.md @@ -0,0 +1,20 @@ +# Configuration Safety Review + +## Change + +## Responsibility Path + +## Contract + +| Setting | Meaning | Default | Bounds | Unsafe Combinations | +| --- | --- | --- | --- | --- | + +## Validation + +## Preview + +## Blast Radius + +## Recovery + +## Drift And Exceptions diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/data-contract.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/data-contract.md new file mode 100644 index 00000000..ba5cb1c0 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/data-contract.md @@ -0,0 +1,18 @@ +# Data Contract + +## Boundary + +## Producer + +## 
Consumers + +## Fields + +| Field | Meaning | Required | Default | Validity Rule | Sensitive | Responsibility Path | +| --- | --- | --- | --- | --- | --- | --- | + +## Compatibility Rules + +## Validation + +## Deprecation And Migration diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/dependency-matrix.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/dependency-matrix.md new file mode 100644 index 00000000..883f63dd --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/dependency-matrix.md @@ -0,0 +1,4 @@ +# Dependency Matrix + +| Dependency | Operation | Timeout | Retry | Idempotency | Failure Behavior | Response Path | +| --- | --- | --- | --- | --- | --- | --- | diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/documentation-lifecycle.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/documentation-lifecycle.md new file mode 100644 index 00000000..5764aef5 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/documentation-lifecycle.md @@ -0,0 +1,10 @@ +# Documentation Lifecycle + +| Doc | Audience | Job | Responsibility Path | Source Of Truth | Review Trigger | Archive Rule | +| --- | --- | --- | --- | --- | --- | --- | + +## Required Docs + +## Stale Docs + +## Findability Check diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/eval-harness-spec.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/eval-harness-spec.md new file mode 100644 index 00000000..25b0bb97 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/eval-harness-spec.md @@ -0,0 +1,16 @@ +# Evaluation Harness Spec + +## Decision + +## Cases + +| Case Set | Source | Slice | Expected Behavior | Blocking | +| --- | --- | --- | --- | --- | + +## Scoring + +## Thresholds + +## Versioned Inputs + +## Failure Triage diff --git 
a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/experiment-guardrail-plan.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/experiment-guardrail-plan.md new file mode 100644 index 00000000..3e7f2ad6 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/experiment-guardrail-plan.md @@ -0,0 +1,16 @@ +# Experiment Guardrail Plan + +## Hypothesis + +## Assignment And Exposure + +## Metrics + +| Metric | Type | Definition | Response Path | Blocks Ramp | +| --- | --- | --- | --- | --- | + +## Validity Checks + +## Ramp And Readout Rules + +## Decision Record diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/incident-postmortem.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/incident-postmortem.md new file mode 100644 index 00000000..2692a016 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/incident-postmortem.md @@ -0,0 +1,15 @@ +# Incident Postmortem + +## Summary + +## Impact + +## Timeline + +## Contributing Factors + +## What Went Well + +## What Went Poorly + +## Action Items diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/prr-checklist.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/prr-checklist.md new file mode 100644 index 00000000..fb2cacfe --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/prr-checklist.md @@ -0,0 +1,17 @@ +# Production Readiness Review + +## Responsibility + +## SLOs + +## Observability + +## Safe Change + +## Security + +## Capacity + +## Recovery + +## Exceptions diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/rollout-plan.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/rollout-plan.md new file mode 100644 index 00000000..0ece313c --- /dev/null +++ 
b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/rollout-plan.md @@ -0,0 +1,15 @@ +# Rollout Plan + +## Change + +## Blast Radius + +## Rollout Stages + +## Canary Metrics + +## Stop Criteria + +## Rollback + +## Cleanup diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/slo-table.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/slo-table.md new file mode 100644 index 00000000..69025e42 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/slo-table.md @@ -0,0 +1,4 @@ +# SLO Table + +| Journey | Response Path | SLI | SLO | Window | Error Budget | Alert Policy | +| --- | --- | --- | --- | --- | --- | --- | diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/support-window-inventory.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/support-window-inventory.md new file mode 100644 index 00000000..1eb96b07 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/support-window-inventory.md @@ -0,0 +1,8 @@ +# Support Window Inventory + +| Component | Responsibility Path | Version | Support Ends | Criticality | Exception | Migration Date | +| --- | --- | --- | --- | --- | --- | --- | + +## Unknowns + +## Unsupported Components diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/threat-model.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/threat-model.md new file mode 100644 index 00000000..3d2b724f --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/threat-model.md @@ -0,0 +1,10 @@ +# Threat Model + +## Trust Boundaries + +## Data Flows + +## Threats + +| Threat | Control | Verification | Response Path | Residual Risk | +| --- | --- | --- | --- | --- | diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/upgrade-readiness-matrix.md 
b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/upgrade-readiness-matrix.md new file mode 100644 index 00000000..b3aada17 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/upgrade-readiness-matrix.md @@ -0,0 +1,10 @@ +# Upgrade Readiness Matrix + +| Component | Responsibility Path | Current Version | Target Version | Support Status | Skew Allowed | Test Evidence | Batch | +| --- | --- | --- | --- | --- | --- | --- | --- | + +## Rollout Order + +## Rollback Or Roll-Forward + +## Exceptions diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/version-skew-policy.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/version-skew-policy.md new file mode 100644 index 00000000..46b9626d --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/assets/templates/version-skew-policy.md @@ -0,0 +1,12 @@ +# Version Skew Policy + +## Scope + +## Allowed Combinations + +| Producer | Consumer | Old Version | New Version | Supported Duration | Evidence | +| --- | --- | --- | --- | --- | --- | + +## Blocked Combinations + +## Retirement Check diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/skill-contract.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/skill-contract.md new file mode 100644 index 00000000..61b3f264 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/skill-contract.md @@ -0,0 +1,67 @@ +# Staff Engineer Mode Skill Contract + +Every specialist file must be concise, triggerable, and artifact-oriented. + +## Required SKILL.md Shape + +- YAML frontmatter with `name` and trigger-only `description`. 
+- `# Skill Name` +- `## Overview` +- `## Iron Law` +- `## When To Use` +- `## When Not To Use` +- `## Info To Gather` +- `## Workflow` +- `## Synthesized Default` +- `## Phase Behavior` +- `## Exceptions` +- `## Response Quality Bar` +- `## Required Outputs` +- `## Checks Before Moving On` +- `## Red Flags - Stop And Rework` +- `## Common Mistakes` + +Do not add per-specialist source, reference, bibliography, citation, or reading-list +sections. Source synthesis belongs in shared reference notes, not in published +skill instructions. + +Every specialist's Response Quality Bar must require technology-agnostic guidance +by default: do not introduce provider, product, framework, database, protocol, +or command names unless the user supplied them or explicitly requested +tool-specific guidance. + +Every specialist must state lifecycle behavior: + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +Unless explicitly designated as an exception, specialists must not be written as after-the-fact audit or review specialists. They must +guide the next decision from context, artifact, surface, risk, and available +details, even when the prompt does not name a formal phase.
+ +## Output Schema + +- `context` +- `risk_register` +- `synthesized_default` +- `exceptions` +- `standard_decisions` +- `required_artifacts` +- `checks_before_moving_on` +- `follow_up_routes` + +## Routing Rules + +- Prefer one primary specialist. +- Recommend at most two follow-up routes. +- Infer artifact, phase, surface, and risk from prompt, repo, files, branch context, and conversation before withholding routing. +- Do not ask intake questions for artifact, phase, surface, or risk; withhold routing only when no in-scope engineering lifecycle/control frame is present. +- Do not route to out-of-scope business, marketing, legal, procurement, staffing, compensation, or broad compliance-program work. +- Eval-harness routing blocks are only for confident in-scope routing; low-confidence, ambiguous, and out-of-scope prompts must not emit routing blocks. +- Keep the router `SKILL.md` compact; detailed routing boundary notes belong in `skills/staff-engineer-mode/references/routing-matrix.md`. diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/source-index.md b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/source-index.md new file mode 100644 index 00000000..7857b4ce --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/source-index.md @@ -0,0 +1,415 @@ +# Staff Engineer Mode Source Index + +## Source Quality Policy + +Use primary sources whenever available: first-party engineering publications, +official cloud/vendor documentation, standards bodies, peer-reviewed papers, or +widely cited practitioner references that originated a named pattern. Vendor and +company engineering blogs are acceptable only as large-scale case studies or +original pattern writeups, not as unchecked marketing claims. Do not add +encyclopedias, Q&A/forum threads, scraped mirrors, SEO summaries, anonymous +content farms, or unmaintained unofficial copies when a primary source exists. 
+ +Sections below are grouped by source owner: company, project, standards body, +publisher, or named author. They are not grouped by skill topic. + +### ACM Queue +- [S190] ACM Queue - Systems Correctness Practices at AWS: https://queue.acm.org/detail.cfm?id=3712057 + +### ADR GitHub Organization +- [S106] ADR GitHub organization and templates: https://adr.github.io/ + +### Alistair Cockburn +- [S110] Alistair Cockburn - Hexagonal Architecture: https://alistair.cockburn.us/hexagonal-architecture + +### Amazon And AWS +- [S30] AWS Well-Architected Framework: https://docs.aws.amazon.com/wellarchitected/latest/framework/welcome.html +- [S31] AWS Well-Architected - Operational Excellence Pillar: https://docs.aws.amazon.com/wellarchitected/latest/operational-excellence-pillar/welcome.html +- [S32] AWS Well-Architected - Reliability Pillar: https://docs.aws.amazon.com/wellarchitected/latest/reliability-pillar/welcome.html +- [S33] AWS Well-Architected - Security Pillar: https://docs.aws.amazon.com/wellarchitected/latest/security-pillar/welcome.html +- [S34] AWS Builders' Library - Timeouts, Retries, and Backoff with Jitter: https://aws.amazon.com/builders-library/timeouts-retries-and-backoff-with-jitter/ +- [S35] AWS Builders' Library - Static Stability Using Availability Zones: https://aws.amazon.com/builders-library/static-stability-using-availability-zones/ +- [S36] AWS Builders' Library - Using Load Shedding to Avoid Overload: https://aws.amazon.com/builders-library/using-load-shedding-to-avoid-overload/ +- [S37] AWS Builders' Library - Avoiding Overload in Distributed Systems by Putting the Smaller Service in Control: https://aws.amazon.com/builders-library/avoiding-overload-in-distributed-systems-by-putting-the-smaller-service-in-control/ +- [S38] AWS Builders' Library - Avoiding Insurmountable Queue Backlogs: https://aws.amazon.com/builders-library/avoiding-insurmountable-queue-backlogs/ +- [S39] AWS Builders' Library - Implementing Health Checks: 
https://aws.amazon.com/builders-library/implementing-health-checks/ +- [S40] AWS Builders' Library - Leader Election in Distributed Systems: https://aws.amazon.com/builders-library/leader-election-in-distributed-systems/ +- [S41] AWS Builders' Library - Making Retries Safe with Idempotent APIs: https://aws.amazon.com/builders-library/making-retries-safe-with-idempotent-APIs/ +- [S42] AWS Builders' Library - Reliability and Constant Work: https://aws.amazon.com/builders-library/reliability-and-constant-work/ +- [S43] AWS Builders' Library - Workload Isolation Using Shuffle-Sharding: https://aws.amazon.com/builders-library/workload-isolation-using-shuffle-sharding/ +- [S44] AWS Builders' Library - Automating Safe, Hands-Off Deployments: https://aws.amazon.com/builders-library/automating-safe-hands-off-deployments/ +- [S45] AWS Architecture Blog - Disaster Recovery Strategies for Recovery in the Cloud: https://aws.amazon.com/blogs/architecture/disaster-recovery-dr-architecture-on-aws-part-i-strategies-for-recovery-in-the-cloud/ +- [S46] AWS SaaS Tenant Isolation Strategies: https://d1.awsstatic.com/whitepapers/saas-tenant-isolation-strategies.pdf +- [S47] Amazon Dynamo: Amazon's Highly Available Key-value Store: https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf +- [S97] AWS Builders' Library - Avoiding Fallback in Distributed Systems: https://aws.amazon.com/builders-library/avoiding-fallback-in-distributed-systems/ +- [S151] DynamoDB partition key best practices: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/bp-partition-key-design.html +- [S182] AWS Best Practices for DDoS Resiliency: https://docs.aws.amazon.com/whitepapers/latest/aws-best-practices-ddos-resiliency/aws-best-practices-ddos-resiliency.html +- [S213] Amazon Science - How Amazon Web Services Uses Formal Methods: https://www.amazon.science/publications/how-amazon-web-services-uses-formal-methods +- [S261] Amazon EKS - Kubernetes Versions: 
https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html +- [S284] AWS Builders' Library - Ensuring Rollback Safety During Deployments: https://aws.amazon.com/builders-library/ensuring-rollback-safety-during-deployments/ +- [S285] AWS Builders' Library - Instrumenting Distributed Systems for Operational Visibility: https://aws.amazon.com/builders-library/instrumenting-distributed-systems-for-operational-visibility/ +- [S286] AWS Builders' Library - Building Dashboards for Operational Visibility: https://aws.amazon.com/builders-library/building-dashboards-for-operational-visibility/ +- [S287] AWS Builders' Library - Going Faster with Continuous Delivery: https://aws.amazon.com/builders-library/going-faster-with-continuous-delivery/ +- [S288] AWS Builders' Library - Using Dependency Isolation to Contain Concurrency Overload: https://aws.amazon.com/builders-library/dependency-isolation/ +- [S289] AWS Builders' Library - Minimizing Correlated Failures in Distributed Systems: https://aws.amazon.com/builders-library/minimizing-correlated-failures-in-distributed-systems/ +- [S290] AWS Builders' Library - Caching Challenges and Strategies: https://aws.amazon.com/builders-library/caching-challenges-and-strategies/ + +### Anthropic +- [S259] Anthropic Docs - Create Strong Empirical Evaluations: https://docs.anthropic.com/en/docs/test-and-evaluate/develop-tests + +### Apache Cassandra +- [S154] Cassandra Data Modeling: https://cassandra.apache.org/doc/latest/cassandra/developing/data-modeling/intro.html + +### Apple +- [S130] Apple Platform Security: https://support.apple.com/guide/security/welcome/web +- [S131] Apple Secure Coding Guide: https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/ +- [S132] Apple Security Research - Private Cloud Compute: https://security.apple.com/blog/private-cloud-compute/ +- [S176] Apple App Store Connect - Release a version update in phases: 
https://developer.apple.com/help/app-store-connect/update-your-app/release-a-version-update-in-phases +- [S179] Apple MetricKit: https://developer.apple.com/documentation/metrickit +- [S210] Apple - Privacy Features: https://www.apple.com/privacy/features/ + +### Argo CD +- [S122] Argo CD Documentation: https://argo-cd.readthedocs.io/ + +### AsyncAPI Initiative +- [S262] AsyncAPI Specification: https://www.asyncapi.com/docs/reference/specification/latest + +### Backstage +- [S123] Backstage Documentation: https://backstage.io/docs/ + +### Brendan Gregg +- [S143] Brendan Gregg - USE Method and Flame Graphs: https://www.brendangregg.com/usemethod.html + +### CA/Browser Forum +- [S263] CA/Browser Forum - Baseline Requirements for TLS Server Certificates: https://cabforum.org/working-groups/server/baseline-requirements/requirements/ + +### CISA +- [S72] CISA Secure by Design: https://www.cisa.gov/resources-tools/resources/secure-by-design +- [S82] CISA Zero Trust Maturity Model: https://www.cisa.gov/zero-trust-maturity-model +- [S193] CISA Known Exploited Vulnerabilities Catalog: https://www.cisa.gov/known-exploited-vulnerabilities-catalog +- [S264] CISA - Joint Guidance on Deploying AI Systems Securely: https://www.cisa.gov/news-events/alerts/2024/04/15/joint-guidance-deploying-ai-systems-securely + +### CloudEvents +- [S265] CloudEvents Specification: https://github.com/cloudevents/spec + +### Cloudflare +- [S184] Cloudflare DDoS Protection Documentation: https://developers.cloudflare.com/ddos-protection/ + +### Confluent +- [S118] Confluent - Schema Registry: https://docs.confluent.io/platform/current/schema-registry/index.html + +### Diataxis +- [S266] Diataxis Documentation Framework: https://diataxis.fr/ + +### Discord +- [S152] Discord Engineering - How Discord Stores Trillions of Messages: https://discord.com/blog/how-discord-stores-trillions-of-messages + +### DORA +- [S22] DORA DevOps Capabilities and Metrics: https://dora.dev/devops-capabilities/ + +### 
Envoy +- [S129] Envoy Documentation: https://www.envoyproxy.io/docs/ + +### Eric Evans +- [S109] Eric Evans - Domain-Driven Design Reference: https://www.domainlanguage.com/wp-content/uploads/2016/05/DDD_Reference_2015-03.pdf + +### Etsy +- [S141] Etsy Debriefing Facilitation Guide: https://extfiles.etsy.com/DebriefingFacilitationGuide.pdf + +### FinOps Foundation +- [S155] FinOps Framework: https://www.finops.org/framework/ + +### FIRST +- [S194] FIRST Exploit Prediction Scoring System: https://www.first.org/epss/ + +### GitHub +- [S186] GitHub Blog - gh-ost: GitHub's Online Schema Migration Tool for MySQL: https://github.blog/news-insights/company-news/gh-ost-github-s-online-migration-tool-for-mysql/ +- [S189] GitHub Docs - About Secret Scanning: https://docs.github.com/en/code-security/concepts/secret-security/about-secret-scanning + +### Google And Firebase +- [S1] Google SRE Book - Embracing Risk: https://sre.google/sre-book/embracing-risk/ +- [S2] Google SRE Book - Service Level Objectives: https://sre.google/sre-book/service-level-objectives/ +- [S3] Google SRE Book - Monitoring Distributed Systems: https://sre.google/sre-book/monitoring-distributed-systems/ +- [S4] Google SRE Book - Release Engineering: https://sre.google/sre-book/release-engineering/ +- [S5] Google SRE Book - Addressing Cascading Failures: https://sre.google/sre-book/addressing-cascading-failures/ +- [S6] Google SRE Book - Managing Incidents: https://sre.google/sre-book/managing-incidents/ +- [S7] Google SRE Book - Postmortem Culture: https://sre.google/sre-book/postmortem-culture/ +- [S8] Google SRE Book - Eliminating Toil: https://sre.google/sre-book/eliminating-toil/ +- [S9] Google SRE Book - The Production Environment at Google, from the Viewpoint of an SRE: https://sre.google/sre-book/production-environment/ +- [S10] Google SRE Workbook - Alerting on SLOs: https://sre.google/workbook/alerting-on-slos/ +- [S11] Google SRE Workbook - Canarying Releases: 
https://sre.google/workbook/canarying-releases/ +- [S12] Google SRE Workbook - Postmortem Culture: Learning from Failure: https://sre.google/workbook/postmortem-culture/ +- [S13] Google - Building Secure and Reliable Systems: https://google.github.io/building-secure-and-reliable-systems/raw/toc.html +- [S14] Software Engineering at Google - Testing Overview: https://abseil.io/resources/swe-book/html/ch11.html +- [S15] Software Engineering at Google - Documentation: https://abseil.io/resources/swe-book/html/ch10.html +- [S16] Software Engineering at Google - Version Control: https://abseil.io/resources/swe-book/html/ch16.html +- [S17] Software Engineering at Google - Continuous Delivery: https://abseil.io/resources/swe-book/html/ch24.html +- [S18] Software Engineering at Google - Large-Scale Changes: https://abseil.io/resources/swe-book/html/ch22.html +- [S19] Google Engineering Practices - Code Review: https://google.github.io/eng-practices/review/ +- [S20] Google Style Guides: https://google.github.io/styleguide/ +- [S21] Google Cloud - Infrastructure Reliability Guide: https://docs.cloud.google.com/architecture/infra-reliability-guide +- [S23] The Tail at Scale: https://research.google/pubs/the-tail-at-scale/ +- [S24] Large-scale Cluster Management at Google with Borg: https://research.google.com/pubs/archive/43438.pdf +- [S25] Dapper, a Large-Scale Distributed Systems Tracing Infrastructure: https://research.google/pubs/dapper-a-large-scale-distributed-systems-tracing-infrastructure/ +- [S26] Spanner: Google's Globally-Distributed Database: https://research.google.com/archive/spanner-osdi2012.pdf +- [S27] Bigtable: A Distributed Storage System for Structured Data: https://research.google.com/archive/bigtable-osdi06.pdf +- [S28] Maglev: A Fast and Reliable Software Network Load Balancer: https://research.google.com/pubs/archive/44824.pdf +- [S60] Google Cloud Blog - Introducing Kayenta, an Open Automated Canary Analysis Tool from Google and Netflix: 
https://cloud.google.com/blog/products/gcp/introducing-kayenta-an-open-automated-canary-analysis-tool-from-google-and-netflix +- [S66] Google Research - Autopilot: Workload Autoscaling at Google Scale: https://research.google/pubs/autopilot-workload-autoscaling-at-google-scale/ +- [S100] Google AIP-180 - Backwards Compatibility: https://google.aip.dev/180 +- [S101] Google AIP-185 - Versioning: https://google.aip.dev/185 +- [S133] Google BeyondCorp: https://research.google/pubs/beyondcorp-a-new-approach-to-enterprise-security/ +- [S170] Google - Rules of Machine Learning: https://developers.google.com/machine-learning/guides/rules-of-ml/ +- [S171] Hidden Technical Debt in Machine Learning Systems: https://papers.nips.cc/paper/5656-hidden-technical-debt-in-machine-learning-systems.pdf +- [S172] Google Research - The ML Test Score: https://research.google/pubs/the-ml-test-score-a-rubric-for-ml-production-readiness-and-technical-debt-reduction/ +- [S173] Google Cloud - MLOps: Continuous delivery and automation pipelines in machine learning: https://cloud.google.com/solutions/machine-learning/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning +- [S177] Google Play Console - Release app updates with staged rollouts: https://support.google.com/googleplay/android-developer/answer/6346149 +- [S178] Firebase Crashlytics - Understand crash-free metrics: https://firebase.google.com/docs/crashlytics/crash-free-metrics +- [S180] web.dev - Web Vitals: https://web.dev/articles/vitals +- [S183] Google Cloud Armor Documentation: https://docs.cloud.google.com/armor/docs +- [S197] Google Cloud Observability - Data Processing SLIs: https://docs.cloud.google.com/stackdriver/docs/solutions/slo-monitoring/sli-metrics/data-proc-metrics +- [S199] Google SRE Workbook - Configuration Design and Best Practices: https://sre.google/workbook/configuration-design/ +- [S200] Google SRE Book - Production Services Best Practices: https://sre.google/sre-book/service-best-practices/ 
+- [S201] Software Engineering at Google - Deprecation: https://abseil.io/resources/swe-book/html/ch15.html +- [S202] Software Engineering at Google - Build Systems and Build Philosophy: https://abseil.io/resources/swe-book/html/ch18.html +- [S209] Google Privacy & Terms - How Google Retains Data We Collect: https://policies.google.com/technologies/retention +- [S267] Google Research - Overlapping Experiment Infrastructure: https://research.google.com/pubs/archive/36500.pdf +- [S268] Google Cloud - Runtime Lifecycle: https://cloud.google.com/appengine/docs/standard/lifecycle/runtime-lifecycle +- [S260] Android Developers - Test Your App's Accessibility: https://developer.android.com/guide/topics/ui/accessibility/testing + +### Grafana +- [S125] Grafana Documentation: https://grafana.com/docs/ +- [S144] Grafana - The RED Method: https://grafana.com/blog/the-red-method-how-to-instrument-your-services/ + +### Great Expectations +- [S198] Great Expectations - Validate Data: https://docs.greatexpectations.io/docs/guides/validation/validate_data_overview/ + +### HashiCorp +- [S121] Terraform Documentation: https://developer.hashicorp.com/terraform/docs + +### Honeycomb +- [S145] Honeycomb - Observability 2.0: https://www.honeycomb.io/blog/one-key-difference-observability1dot0-2dot0 + +### IETF +- [S84] OAuth 2.1 Draft: https://datatracker.ietf.org/doc/html/draft-ietf-oauth-v2-1 +- [S85] RFC 8446 - TLS 1.3: https://datatracker.ietf.org/doc/html/rfc8446 +- [S104] RFC 9457 - Problem Details for HTTP APIs: https://www.rfc-editor.org/rfc/rfc9457.html +- [S269] RFC 7696 - Guidelines for Cryptographic Algorithm Agility: https://www.rfc-editor.org/rfc/rfc7696 + +### Industrial Empathy +- [S159] Industrial Empathy - Design Docs at Google: https://www.industrialempathy.com/posts/design-docs-at-google/ + +### Istio +- [S195] Istio Traffic Management: https://istio.io/latest/docs/concepts/traffic-management/ + +### Jay Kreps +- [S153] Jay Kreps - The Log: 
https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying + +### John D. C. Little +- [S157] Little - A Proof for the Queuing Formula L = lambda W: https://pubsonline.informs.org/doi/10.1287/opre.9.3.383 + +### JSON Schema +- [S270] JSON Schema Specification: https://json-schema.org/specification + +### Kubernetes +- [S120] Kubernetes Documentation: https://kubernetes.io/docs/home/ +- [S196] Kubernetes Gateway API: https://kubernetes.io/docs/concepts/services-networking/gateway/ +- [S271] Kubernetes Version Skew Policy: https://kubernetes.io/releases/version-skew-policy + +### Martin Fowler +- [S108] Martin Fowler - What do you mean by Event-Driven?: https://martinfowler.com/articles/201701-event-driven.html +- [S111] Martin Fowler - Bounded Context: https://martinfowler.com/bliki/BoundedContext.html +- [S114] Martin Fowler - MonolithFirst: https://martinfowler.com/bliki/MonolithFirst.html +- [S115] Martin Fowler - Feature Toggles: https://martinfowler.com/articles/feature-toggles.html +- [S160] Martin Fowler - The Practical Test Pyramid: https://martinfowler.com/articles/practical-test-pyramid.html +- [S161] Martin Fowler - Circuit Breaker: https://martinfowler.com/bliki/CircuitBreaker.html +- [S162] Martin Fowler - Microservice Premium: https://martinfowler.com/bliki/MicroservicePremium.html +- [S163] Martin Fowler - CanaryRelease: https://martinfowler.com/bliki/CanaryRelease.html + +### Martin Kleppmann +- [S148] Designing Data-Intensive Applications: https://dataintensive.net/ + +### Meta +- [S50] Meta Engineering - Move Faster, Wait Less: Improving Code Review Time at Meta: https://engineering.fb.com/2022/11/16/culture/meta-code-review-time-improving/ +- [S51] Meta Engineering - Open-sourcing Facebook Infer: https://engineering.fb.com/developer-tools/open-sourcing-facebook-infer-identify-bugs-before-you-ship/ +- [S52] Meta Engineering - Sapienz: Intelligent Automated Software Testing at 
Scale: https://engineering.fb.com/developer-tools/sapienz-intelligent-automated-software-testing-at-scale/ +- [S53] Meta Engineering - TAO: The Power of the Graph: https://engineering.fb.com/2013/06/25/core-infra/tao-the-power-of-the-graph/ +- [S54] Meta Engineering - Scaling Memcache at Facebook: https://engineering.fb.com/2013/04/15/core-infra/scaling-memcache-at-facebook/ +- [S55] Meta Engineering - Cache Made Consistent: https://engineering.fb.com/2022/06/08/core-infra/cache-made-consistent/ +- [S56] Meta Engineering - Update About the October 4th Outage: https://engineering.fb.com/2021/10/04/networking-traffic/outage/ +- [S58] Meta Engineering - Automating Dead Code Cleanup: https://engineering.fb.com/2023/10/24/data-infrastructure/automating-dead-code-cleanup/ +- [S205] Meta Engineering - Automating Product Deprecation: https://engineering.fb.com/2023/10/17/data-infrastructure/automating-product-deprecation-meta/ +- [S206] Meta Engineering - Automating Data Removal: https://engineering.fb.com/2023/10/31/data-infrastructure/automating-data-removal/ +- [S207] Meta Engineering - DELF: Safeguarding Deletion Correctness: https://engineering.fb.com/2020/08/12/security/delf/ +- [S208] Meta Engineering - Privacy Aware Infrastructure Purpose Limitation: https://engineering.fb.com/2024/08/27/security/privacy-aware-infrastructure-purpose-limitation-meta/ +- [S211] Meta Engineering - How Meta Understands Data at Scale: https://engineering.fb.com/2025/04/28/security/how-meta-understands-data-at-scale/ + +### Michael Nygard +- [S191] Michael Nygard - Documenting Architecture Decisions: https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions + +### Microservices.io +- [S116] Microservices.io - Transactional Outbox: https://microservices.io/patterns/data/transactional-outbox.html +- [S117] Microservices.io - Saga: https://microservices.io/patterns/data/saga.html + +### Microsoft And Azure +- [S90] Azure Well-Architected Framework: 
https://learn.microsoft.com/en-us/azure/well-architected/ +- [S91] Azure Well-Architected - Mission-Critical Design Principles: https://learn.microsoft.com/en-us/azure/well-architected/mission-critical/mission-critical-design-principles +- [S92] Microsoft Security Development Lifecycle: https://learn.microsoft.com/en-us/compliance/assurance/assurance-microsoft-security-development-lifecycle +- [S93] Microsoft Learn - Integrating Threat Modeling with DevOps: https://learn.microsoft.com/en-us/security/engineering/threat-modeling-with-dev-ops +- [S94] Azure Architecture Center - Retry Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/retry +- [S95] Azure Architecture Center - Circuit Breaker Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/circuit-breaker +- [S96] Azure Architecture Center - Bulkhead Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/bulkhead +- [S214] Azure Well-Architected - Reliability Checklist: https://learn.microsoft.com/en-us/azure/well-architected/reliability/checklist +- [S215] Azure Well-Architected - Safe Deployment Practices: https://learn.microsoft.com/en-us/azure/well-architected/operational-excellence/safe-deployments +- [S216] Azure Well-Architected - Incident Management Process: https://learn.microsoft.com/en-us/azure/well-architected/operational-excellence/mitigation-strategy +- [S217] Azure Well-Architected - Performance Efficiency Checklist: https://learn.microsoft.com/en-us/azure/well-architected/performance-efficiency/checklist +- [S218] Azure Well-Architected - Performance Testing Strategies: https://learn.microsoft.com/en-us/azure/well-architected/performance-efficiency/performance-test +- [S219] Azure Well-Architected - Cost Optimization Tradeoffs: https://learn.microsoft.com/en-us/azure/well-architected/cost-optimization/tradeoffs +- [S220] Azure Architecture Center - Deployment Stamps Pattern: 
https://learn.microsoft.com/en-us/azure/architecture/patterns/deployment-stamp +- [S221] Azure Well-Architected - Availability Zones and Regions: https://learn.microsoft.com/en-us/azure/well-architected/reliability/regions-availability-zones +- [S222] Azure Architecture Center - Rate Limiting Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/rate-limiting-pattern +- [S223] Azure Architecture Center - Queue-Based Load Leveling Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/queue-based-load-leveling +- [S224] Microsoft DevOps - How Microsoft Develops with DevOps: https://learn.microsoft.com/en-us/devops/develop/how-microsoft-develops-devops +- [S225] Microsoft DevOps - How Microsoft Delivers Software with DevOps: https://learn.microsoft.com/en-us/devops/deliver/how-microsoft-delivers-devops +- [S226] Microsoft DevOps - How Microsoft Operates Reliable Systems with DevOps: https://learn.microsoft.com/en-us/devops/operate/how-microsoft-operates-devops +- [S227] Microsoft DevOps - Shift Testing Left with Unit Tests: https://learn.microsoft.com/en-us/devops/develop/shift-left-make-testing-fast-reliable +- [S228] Microsoft DevOps - Continuous Delivery: https://learn.microsoft.com/en-us/devops/deliver/what-is-continuous-delivery +- [S229] Microsoft Platform Engineering - Self-Service with Guardrails: https://learn.microsoft.com/en-us/platform-engineering/about/self-service +- [S230] Microsoft Cloud Adoption Framework - Azure Landing Zones: https://learn.microsoft.com/en-us/azure/cloud-adoption-framework/ready/landing-zone/ +- [S231] Microsoft Entra - Configure Zero Trust to Protect Identities and Secrets: https://learn.microsoft.com/en-us/entra/fundamentals/zero-trust-protect-identities +- [S232] Microsoft Entra - Workload Identities: https://learn.microsoft.com/en-us/entra/workload-id/workload-identities-overview +- [S233] Microsoft Cloud Security Benchmark v2 Preview - DevOps Security: 
https://learn.microsoft.com/en-us/security/benchmark/azure/mcsb-v2-devop-security +- [S234] Microsoft Secure Future Initiative - Protect the Software Supply Chain: https://learn.microsoft.com/en-us/security/zero-trust/sfi/protect-software-supply-chain +- [S235] Azure DDoS Protection - Fundamental Best Practices: https://learn.microsoft.com/en-us/azure/ddos-protection/fundamental-best-practices +- [S236] Azure Architecture Center - API Design: https://learn.microsoft.com/en-us/azure/architecture/microservices/design/api-design +- [S237] Azure Architecture Center - Data Partitioning Strategies: https://learn.microsoft.com/en-us/azure/architecture/best-practices/data-partitioning-strategies +- [S238] Azure Architecture Center - Cache-Aside Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/cache-aside +- [S239] Azure Architecture Center - CQRS Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/cqrs +- [S240] Azure Architecture Center - Event Sourcing Pattern: https://learn.microsoft.com/en-us/azure/architecture/patterns/event-sourcing +- [S241] Microsoft Trust Center - Privacy: https://www.microsoft.com/en-us/trust-center/privacy +- [S242] Microsoft Purview - Data Lifecycle Management: https://learn.microsoft.com/en-us/purview/data-lifecycle-management +- [S243] Azure Well-Architected - Security Checklist: https://learn.microsoft.com/en-us/azure/well-architected/security/checklist +- [S244] Azure Well-Architected - Threat Analysis Strategies: https://learn.microsoft.com/en-us/azure/architecture/framework/security/design-threat-model +- [S245] Azure Well-Architected - Build a Monitoring System: https://learn.microsoft.com/en-us/azure/well-architected/design-guides/monitoring +- [S246] Azure Reliability - Business Continuity, High Availability, and Disaster Recovery: https://learn.microsoft.com/en-us/azure/reliability/disaster-recovery-overview +- [S247] Azure Chaos Studio - Chaos Engineering and Resilience: 
https://learn.microsoft.com/en-us/azure/chaos-studio/chaos-studio-chaos-engineering-overview +- [S248] Azure Well-Architected - Mission-Critical Health Modeling: https://learn.microsoft.com/en-us/azure/well-architected/mission-critical/mission-critical-health-modeling +- [S249] Azure Well-Architected - Health Modeling for Workloads: https://learn.microsoft.com/en-us/azure/well-architected/design-guides/health-modeling +- [S250] Microsoft Entra - Managed Identities for Azure Resources: https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/overview +- [S251] Microsoft Entra - Workload Identity Federation: https://learn.microsoft.com/en-us/entra/workload-id/workload-identity-federation +- [S252] Microsoft Platform Engineering - Platform Engineering Capability Model: https://learn.microsoft.com/en-us/platform-engineering/platform-engineering-capability-model +- [S253] Microsoft AI Red Team: https://learn.microsoft.com/en-us/security/ai-red-team/ +- [S254] Microsoft Security Engineering - Threat Modeling AI/ML Systems and Dependencies: https://learn.microsoft.com/en-us/security/engineering/threat-modeling-aiml +- [S255] Microsoft Security Engineering - Failure Modes in Machine Learning: https://learn.microsoft.com/en-us/security/engineering/failure-modes-in-machine-learning +- [S256] Azure Well-Architected - Security Incident Response: https://learn.microsoft.com/en-us/azure/well-architected/security/incident-response +- [S257] Azure Architecture Center - Tenancy Models for a Multitenant Solution: https://learn.microsoft.com/en-us/azure/architecture/guide/multitenant/considerations/tenancy-models +- [S258] Microsoft Cloud Security Benchmark v2 Preview - Overview: https://learn.microsoft.com/en-us/security/benchmark/azure/overview +- [S272] Microsoft Research - Diagnosing Sample Ratio Mismatch in Online Controlled Experiments: 
https://www.microsoft.com/en-us/research/publication/diagnosing-sample-ratio-mismatch-in-online-controlled-experiments-a-taxonomy-and-rules-of-thumb-for-practitioners/ + +### MITRE +- [S87] MITRE ATT&CK: https://attack.mitre.org/ + +### Netflix +- [S62] A Platform for Automating Chaos Experiments: https://arxiv.org/abs/1702.05849 +- [S63] Netflix Chaos Monkey Documentation: https://netflix.github.io/chaosmonkey/ +- [S64] Netflix DGS Framework - Federation: https://netflix.github.io/dgs/federation/ +- [S65] Netflix Repokid: https://github.com/Netflix/repokid +- [S212] Netflix - Automating Chaos Experiments in Production: https://arxiv.org/abs/1905.04648 + +### NIST +- [S70] NIST SP 800-218 - Secure Software Development Framework: https://csrc.nist.gov/Projects/ssdf +- [S81] NIST SP 800-207 - Zero Trust Architecture: https://csrc.nist.gov/pubs/sp/800/207/final +- [S86] NIST Post-Quantum Cryptography Project: https://csrc.nist.gov/projects/post-quantum-cryptography +- [S192] NIST FIPS 203 - Module-Lattice-Based Key-Encapsulation Mechanism Standard: https://csrc.nist.gov/pubs/fips/203/final +- [S203] NIST Privacy Framework: https://www.nist.gov/privacy-framework/privacy-framework +- [S204] NIST SP 800-53 Revision 5 - Security and Privacy Controls: https://csrc.nist.gov/pubs/sp/800/53/r5/upd1/final +- [S273] NIST SP 800-128 - Security-Focused Configuration Management: https://csrc.nist.gov/publications/detail/sp/800-128/final +- [S274] NIST AI Risk Management Framework 1.0: https://www.nist.gov/publications/artificial-intelligence-risk-management-framework-ai-rmf-10 +- [S275] NIST AI 600-1 - Generative AI Profile: https://www.nist.gov/publications/artificial-intelligence-risk-management-framework-generative-artificial-intelligence +- [S276] NIST SP 800-57 Part 1 Revision 5 - Recommendation for Key Management: https://csrc.nist.gov/pubs/sp/800/57/pt1/r5/final +- [S277] NIST SP 800-131A Revision 2 - Transitioning Cryptographic Algorithms and Key Lengths: 
https://csrc.nist.gov/pubs/sp/800/131/a/r2/final + +### OpenAI +- [S278] OpenAI API - Agent Evals: https://platform.openai.com/docs/guides/agent-evals + +### OpenAPI Initiative +- [S279] OpenAPI Specification: https://spec.openapis.org/oas/ + +### Open Policy Agent +- [S126] Open Policy Agent Documentation: https://www.openpolicyagent.org/docs/latest/ + +### OpenSSF +- [S78] OpenSSF Scorecard: https://github.com/ossf/scorecard +- [S79] OpenSSF Open Source Project Security Baseline: https://baseline.openssf.org/ +- [S280] OpenSSF - Security-Focused Guide for AI Code Assistant Instructions: https://best.openssf.org/Security-Focused-Guide-for-AI-Code-Assistant-Instructions +- [S281] OpenSSF - AI/ML Security Working Group: https://openssf.org/technical-initiatives/ai-ml-security/ + +### OpenTelemetry +- [S89] OpenTelemetry Documentation: https://opentelemetry.io/docs/what-is-opentelemetry/ +- [S128] OpenTelemetry Collector Documentation: https://opentelemetry.io/docs/collector/ + +### OWASP +- [S73] OWASP Application Security Verification Standard: https://owasp.org/www-project-application-security-verification-standard/ +- [S74] OWASP Top 10: https://owasp.org/Top10/ +- [S75] OWASP Cheat Sheet Series: https://cheatsheetseries.owasp.org/ +- [S175] OWASP Top 10 for LLM Applications 2025: https://genai.owasp.org/resource/owasp-top-10-for-llm-applications-2025/ + +### PagerDuty +- [S140] PagerDuty Incident Response: https://response.pagerduty.com/ + +### Perfdynamics +- [S156] Universal Scalability Law: https://www.perfdynamics.com/Manifesto/USLscalability.html + +### PostgreSQL +- [S187] PostgreSQL Documentation - Routine Vacuuming: https://www.postgresql.org/docs/current/routine-vacuuming.html + +### Principles Of Chaos Engineering +- [S61] Principles of Chaos Engineering: https://principlesofchaos.org/ + +### Prometheus +- [S124] Prometheus Documentation: https://prometheus.io/docs/introduction/overview/ + +### Richard Cook +- [S142] Richard Cook - How Complex Systems 
Fail: https://how.complexsystems.fail/ + +### Semantic Versioning +- [S282] Semantic Versioning Specification: https://semver.org/ + +### Shopify +- [S112] Shopify Engineering - Deconstructing the Monolith: https://shopify.engineering/deconstructing-monolith-designing-software-maximizes-developer-productivity + +### Sigstore +- [S80] Sigstore Documentation: https://docs.sigstore.dev/about/overview/ +- [S127] Cosign Documentation: https://docs.sigstore.dev/cosign/signing/overview/ + +### SLSA +- [S76] SLSA Framework: https://slsa.dev/spec/ +- [S77] SLSA Build Provenance Specification: https://slsa.dev/spec/v1.2/build-provenance + +### SPIFFE/SPIRE +- [S83] SPIFFE/SPIRE: https://spiffe.io/ + +### Stripe +- [S102] Stripe - Designing Robust and Predictable APIs with Idempotency: https://stripe.com/blog/idempotency +- [S103] Stripe - API Versioning: https://stripe.com/blog/api-versioning +- [S185] Stripe - Online Migrations at Scale: https://stripe.com/blog/online-migrations + +### The Twelve-Factor App +- [S147] Twelve-Factor App - Config: https://12factor.net/config + +### Trunk Based Development +- [S146] Trunk Based Development: https://trunkbaseddevelopment.com/ + +### Uber +- [S113] Uber Engineering - DOMA: https://www.uber.com/us/en/blog/microservice-architecture/ + +### Vitess +- [S188] Vitess Documentation - Managed, Online Schema Changes: https://vitess.io/docs/24.0/user-guides/schema-changes/managed-online-schema-changes/ + +### W3C +- [S88] W3C Trace Context: https://www.w3.org/TR/trace-context/ +- [S181] W3C - Web Content Accessibility Guidelines 2.2: https://www.w3.org/TR/WCAG22/ +- [S283] W3C - Accessibility Conformance Testing Rules Format: https://www.w3.org/TR/act-rules-format/ + +### Werner Vogels +- [S48] Werner Vogels - Eventually Consistent: https://www.allthingsdistributed.com/2008/12/eventually_consistent.html diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/synthesis-matrix.md 
b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/synthesis-matrix.md new file mode 100644 index 00000000..7a99b6c2 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/_shared/references/synthesis-matrix.md @@ -0,0 +1,23 @@ +# Staff Engineer Mode Synthesis Matrix + +This file records the normalized defaults used by the hand-authored skills. It is a navigation aid, not a generated source of truth. The `SKILL.md` files carry the authoritative instructions. + +| Theme | Normalized Default | +| --- | --- | +| Routing | Select one primary skill by engineering surface, event type, risk, and scope. When confidence is low, infer the narrowest in-scope route from available context before withholding routing. | +| Architecture and interfaces | Prefer modular boundaries, explicit contracts, and ADRs before adding distributed complexity. | +| Reliability and resilience | Define user-visible reliability, bound failure domains, control dependency amplification, model tail latency, validate correctness properties, and prove recovery with evidence. | +| Delivery and quality | Make builds, config, automation, docs, fleet upgrades, and changes gradual, observable, reversible, tested, reviewed, and migrated with explicit evidence. | +| Operations and observability | Page only on urgent actionable user impact; use telemetry to explain impact and causality. | +| Data and workflows | Start from data semantics and contracts, then choose consistency, workflow, cache, database, pipeline, and ML controls. | +| Security and privacy | Map trust boundaries, cryptographic lifecycle, and data lifecycles to enforceable controls, least privilege, minimization, evidence, and verification. | +| Platform and infrastructure | Encode standards as reusable capabilities with desired state, policy, drift control, and operational responsibility. | +| Client and edge experience | Gate client releases on user-visible runtime quality, segmented telemetry, and rollback or forward-fix paths. 
| +| AI and experimentation | Gate AI-assisted development, model-backed workflows, and experiments with scoped authority, representative evaluation, metric trust, and reviewable evidence. | + +## Cross-Cutting Rules + +- Skills must stay technology-agnostic unless explicitly tied to a domain such as frontend, mobile, ML, or LLM applications. +- Vendor and tool references may appear as sources, but defaults must be expressed as capabilities and evidence. +- Competing source practices should be blended into one pragmatic default with explicit exceptions. +- Missing evidence is a blocker, exception, or follow-up route, not an acceptable claim. diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/SKILL.md b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/SKILL.md new file mode 100644 index 00000000..e389cddf --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/SKILL.md @@ -0,0 +1,182 @@ +--- +name: staff-engineer-mode +description: "Use when making engineering decisions across ideation, design, development, testing, release, operations, or maintenance" +--- + +# Staff Engineer Mode + +## Iron Law + +``` +ONE PRIMARY SPECIALIST BY DEFAULT; INFER ROUTING CONTEXT BEFORE WITHHOLDING +``` + +Loading many plausible specialists is a routing failure. + +## Precedence Over Generic Process Packs + +When the request touches an engineering surface -- architecture, reliability, resilience, operations, security, delivery, data, platform, client, AI/ML, accessibility, cost, production-readiness, rollout, migration, incident response, control records, API design, service contracts, or design of any engineering system -- **Staff Engineer Mode runs first**. + +Do not invoke `superpowers:brainstorming`, `superpowers:writing-plans`, or any other broad process / design skill as the first response to an engineering-system prompt. 
Route through Staff Engineer Mode and load the selected specialist via the Load Contract below. A process skill may be used only after the specialist is loaded, and only for sub-decisions inside that specialist's workflow. + +Phrasings such as "build X", "design X", "make X reliable", "add HA to X", "plan a rollout", "review this service", "prep for launch", or "investigate this incident" -- where X is an engineering system -- ARE engineering-system prompts. Route them through Staff Engineer Mode, not through generic brainstorming. The user prompt does not need to name lifecycle phases or specialist slugs. + +## Load Contract + +To load a specialist, **Read** the file at `<SPECIALIST_ROOT>/<slug>.md`. Resolve `<SPECIALIST_ROOT>` in this order: + +1. If a `SPECIALIST_ROOT=` line is present in this session's additional context (Claude Code, Cursor, OpenCode), use that absolute path. +2. Otherwise use the platform default: + - Codex: `~/.codex/staff-engineer-mode/specialists` + - Gemini: the `specialists` directory next to the loaded `GEMINI.md` + - Any other host: the `specialists/` directory at the router skill's install root. + +Three rules, all mandatory: + +- **Use the Read tool. Do not use the Skill tool.** Specialists are not registered skills on any supported platform. `Skill staff-engineer-mode:<slug>` returns `Unknown skill` and is a routing failure. +- **Complete the Read before producing engineering guidance for routed work.** Do not answer routed engineering prompts from priors. +- **A confidently-routed answer without a matching Read in the same turn is a routing failure even when the slug is correct.** + +## Overview + +Users are not expected to know specialist names. Classify by artifact, phase, surface, and risk, then quietly select the specialist whose outputs fit the next useful artifact. + +## When To Use + +- The request asks for engineering decisions or guidance for design, delivery, operations, reliability, security, architecture, API, data, platform, or client work. 
+- The user asks to guide ideation, design, development, testing, release, or maintenance decisions. +- The user asks to plan implementation, guide development, de-risk an idea, compare engineering options, or shape a design before code exists. +- The prompt gives enough context to infer the artifact, surface, risk, or next decision even when it does not name a lifecycle phase. +- The request is broad, vague, or spans multiple engineering surfaces. +- No single specialist clearly dominates from the prompt. +- The user asks for staff-engineer-level architecture, reliability, security, operations, delivery, data, platform, client, or cost guidance. +- The user asks to troubleshoot an unclear network, deployment, reliability, performance, security, data, or operations issue. + +## When Not To Use + +- A focused specialist has already been selected and loaded for the current request. +- The request is product discovery, marketing, staffing, compensation, procurement, legal/auditor liaison, broad compliance program management, or business strategy. +- The request is routine editorial or mechanical single-file documentation cleanup with no source-of-truth, freshness, operational, or lifecycle decision. +- The work is outside system delivery, operations, security, reliability, or maintainability. + +## Inputs To Infer + +Infer these from the prompt, repo, files, branch context, and conversation. Do not ask the user to supply them as intake fields. + +- **Artifact:** decision, design, plan, readiness check, rollout, investigation, runbook, migration, eval, control pack, or diff review. +- **Phase:** ideation, design, development, testing, before merge, release, migration, active incident, post-incident, regression, readiness, or maintenance. +- **Surface:** architecture, contract, reliability target, topology, dependency, performance, observability, delivery, data, platform, security, client, AI, accessibility, cost, or operator load. 
+- **Risk/scope:** availability, latency, durability, correctness, privacy/security, compatibility, release safety, tenant/customer impact, public edge, internal traffic, multi-service, or multi-location. + +## Bundled Specialist Slugs + +Pick `primary` and `secondary` only from this exact list. Never invent, shorten, or paraphrase a slug. + +``` +accessibility-gates, agent-pr-review, ai-coding-governance, api-design-and-compatibility, +architecture-decisions, backup-and-recovery, caching-and-derived-data, +code-readability-for-agents, configuration-and-automation-safety, +cost-aware-reliability, cryptography-and-key-lifecycle, database-operations, data-contracts, +data-pipeline-reliability, dependency-and-code-hygiene, dependency-resilience, +dev-environment-parity, distributed-data-and-consistency, documentation-lifecycle, +edge-traffic-and-ddos-defense, engineering-control-evidence, event-workflows, +experimentation-and-metric-guardrails, feature-flag-lifecycle, fleet-upgrades, +high-availability-design, identity-and-secrets, incident-response-and-postmortems, +infrastructure-and-policy-as-code, internal-service-networking, llm-application-security, +llm-evaluation, llm-serving-cost-and-latency, migration-and-deprecation, +ml-reliability-and-evaluation, mobile-release-engineering, observability-and-alerting, +oncall-health, performance-and-capacity, platform-golden-paths, privacy-and-data-lifecycle, +production-readiness-review, progressive-delivery, release-build-reproducibility, +resilience-experiments, secure-sdlc-and-threat-modeling, slo-and-error-budgets, +software-supply-chain-security, state-machine-correctness, tenant-isolation, +test-data-engineering, testing-and-quality-gates, vulnerability-management, +web-release-gates +``` + +## Workflow + +1. Infer the requested artifact and phase from prompt, repo, files, branch context, and conversation before naming any skill. +2. 
If the work is in ideation, design, development, testing, release, or maintenance and has an engineering surface, route by the decision or artifact the specialist should guide; concrete files, diffs, and repo artifacts improve the answer, and are required only for explicitly diff-specific review. +3. Treat phase labels as signals, not hard requirements; infer applicability from context, artifact, surface, risk, and the next decision. +4. Translate named tools into capabilities; routing outputs must use capability language, not repeat tool, vendor, framework, protocol, database, or command names from the prompt. +5. Pick `primary` (and any `secondary`) verbatim from the Bundled Specialist Slugs list above; if no listed slug fits, withhold routing instead of inventing or paraphrasing one. +6. Choose the narrowest primary whose required outputs match the next artifact. +7. Add one secondary only when the user explicitly asks for a separate artifact covered by another skill. +8. Load the chosen specialist per the Load Contract above before producing detailed guidance. +9. If confidence is low, infer the safest narrow in-scope route from available context; withhold routing only when no engineering lifecycle/control frame is present. +10. Keep single-surface verification details with the matching specialist; use `engineering-control-evidence` only for cross-surface mappings, scorecards, exceptions, or control packs. +11. Reframe out-of-scope work as an engineering-control question only when that is plausible. + +## Synthesized Default + +Select one primary when the prompt has enough context. Recommend at most one secondary follow-up. Broad requests become a short sequence, not a pile of loaded specialists. + +## Exceptions + +- For explicit launch/readiness decisions or broad release readiness checks, use `production-readiness-review` as primary. +- For active incidents, use `incident-response-and-postmortems` first even if root cause appears to belong elsewhere. 
+- For vague prompts such as "make this better" or "troubleshoot a network issue", infer from repo and conversation context before withholding routing. +- For out-of-scope business or ceremony prompts, do not select a skill unless context already supplies an engineering lifecycle/control framing. + +## Review Routing + +Treat "review" as a verb until the artifact proves otherwise. + +- Concrete PR, branch, patch, last commit, or diff review before merge routes to `agent-pr-review`. +- Changed files alone do not make a diff review; route static-analysis or maintenance backlog prioritization to `dependency-and-code-hygiene`. +- Generic review-system design, reviewer routing, ownership, change size, review latency, or DORA workflow has no routed specialist unless a concrete engineering surface is present. +- Launch readiness, go/no-go, tier upgrade, or broad release readiness routes to `production-readiness-review`. +- Design review, architecture review, security review, API review, data review, rollout review, or test review without a concrete diff routes by the engineering surface, not by the word "review". +- A surface-specific change before merge still routes to the narrow surface specialist when the requested artifact is compatibility, deprecation, migration, safety, rollout, security, accessibility, data, or test results rather than a general diff verdict. + +## Required Outputs + +- For confident routing: primary specialist slug; optional secondary only when necessary; confidence of high or medium. +- Inferred intent: requested artifact, dominant surface, work phase, and one-sentence rationale. +- For explicit eval-harness runs only: include a fenced `routing` block only for confident in-scope routing; never emit a routing block for low-confidence, ambiguous, or out-of-scope prompts. 
The block contains a JSON object with `primary`, `secondary`, `confidence`, `artifact`, `surface`, `phase`, and `rationale`; JSON text fields must not repeat tool, vendor, framework, protocol, database, or command names from the prompt. +- For low-confidence routing: infer a best-effort route when in scope; otherwise withhold routing without intake questions, candidate lists, confidence labels, routing drafts, or specialist names. +- Out-of-scope reframe when applicable, without specialist names or candidate routes. + +## Checks Before Moving On + +- `single_primary`: output has exactly one primary specialist unless routing is withheld. +- `secondary_cap`: output has no more than one secondary specialist. +- `capability_translation`: tool, vendor, or framework names are translated into capability language before routing and not repeated in routing block fields. +- `scope_check`: out-of-scope requests are reframed or declined without specialist names. +- `ambiguity_check`: ambiguous prompts infer from available context when possible; withheld routes expose no specialist names, candidate routes, confidence labels, drafts, or intake questions. +- `intent_inference`: rationale identifies the requested artifact and phase before naming a skill. + +## Routing Tiebreakers + +Use this section for common routing precedence. Load `references/routing-matrix.md` for exact-slug guardrails, eval runs, exact-slug uncertainty, or adjacent surfaces. + +- Explicit launch, major traffic shift, tier upgrade, or readiness decision routes to `production-readiness-review`; active user-impacting incidents route to `incident-response-and-postmortems` before root-cause specialty work. +- Prefer newer narrow routes over broad neighbors. Concrete PR, branch, patch, or diff review routes to `agent-pr-review` even when test results are mentioned; otherwise route the engineering decision to the narrow surface specialist. 
+- Reliability policy, telemetry construction, on-call load, fault-domain topology/static failover capacity, restore capability, failure experiments, overload controls, and state invariants are separate surfaces. +- API compatibility, data contracts, migrations, hygiene, fleet upgrades, event replay/DLQ, database backfills, cross-service database/storage correctness, cache freshness, and pipeline freshness stay distinct. +- Database migration execution is `database-operations`; if the same prompt separately asks for future blocking checks, add `testing-and-quality-gates` as secondary. +- Build/release artifacts, production exposure, rollback plans, config or automation mutation, and feature-flag lifecycle are separate delivery artifacts. +- Desired-state capture, drift detection, reconciliation, or emergency exception rules after manual infrastructure changes route to `infrastructure-and-policy-as-code`. +- Deprecation PRs/no-new-usage checks stay with `migration-and-deprecation`; ML promotion/eval/skew/drift/rollback stays with `ml-reliability-and-evaluation`. +- Security routes by artifact: threat model, identity/secrets, cryptography, supply-chain trust, deployed vulnerability, tenant boundary, privacy lifecycle, or LLM app risk. +- Public edge defense, service identity/discovery/locality, dependency retry/timeout/circuit-breaker policy, backend capacity, browser field/lab release signals, accessibility, cost tradeoffs, LLM eval/serving/security, AI coding controls, and code readability stay separate. +- Single-surface verification details stay with the matching specialist; cross-surface control mappings, scorecards, exception records, and control packs route to `engineering-control-evidence`. + +## Red Flags - Stop And Rework + +- More than two specialists are selected automatically. +- The router chooses from a phrase match without identifying artifact and phase. 
+- A tool or vendor name drives routing without capability translation, or appears in routing block text. +- `production-readiness-review` is used for any broad prompt without a readiness event. +- Compliance, staffing, compensation, procurement, or marketing work is routed as engineering work. +- A low-confidence or out-of-scope answer names candidate specialists, prints a routing draft, or exposes the internal shortlist. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Keyword matching | Infer artifact, phase, surface, and risk. | +| Loading every related specialist | Choose one primary and list at most one follow-up. | +| Treating tools as domains | Translate tools to capabilities. | +| Dumping candidate specialists | Infer the narrowest route, or withhold only when no in-scope frame exists. | +| Asking intake questions too soon | Infer from prompt, repo, files, branch context, and conversation first. | diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/bootstrap-context.md b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/bootstrap-context.md new file mode 100644 index 00000000..227d840b --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/bootstrap-context.md @@ -0,0 +1,22 @@ +SPECIALIST_ROOT={{SPECIALIST_ROOT}} +LOAD: Read ${SPECIALIST_ROOT}/<slug>.md before producing engineering guidance for routed work. Never call the Skill tool on a specialist slug -- specialists are files, not registered skills. + + +Primary entrypoint for engineering-system requests is Staff Engineer Mode. This overrides any generic process pack on these surfaces: architecture, reliability, resilience, operations, security, delivery, data, platform, client, AI/ML, accessibility, cost, production-readiness, rollout, migration, incident response, control records, API design, service contracts, and design of engineering systems. 
+ +Do not invoke a broad design/process skill as the first response to an engineering-system prompt. First route through Staff Engineer Mode and Read the selected specialist file. Only after the specialist is loaded may you optionally use a process skill for sub-decisions inside that specialist's workflow. + +"Build X", "design X", "make X reliable", "add HA to X", "plan a rollout", "review this service", "prep for launch", "investigate this incident" -- when X is an engineering system -- ARE engineering-system prompts. Route them through Staff Engineer Mode, not through generic brainstorming. + + + +You have staff-engineer-mode. + +Users are not expected to know or invoke individual Staff Engineer Mode specialist names. For engineering lifecycle, DevOps, operations, reliability, resilience, security, architecture, data, platform, client, and cost-aware reliability requests, apply the router instructions below. After routing, read only the selected specialist reference file from `${SPECIALIST_ROOT}/<slug>.md` before giving detailed guidance. + +Keep guidance technology-agnostic by default. Do not introduce cloud providers, frameworks, databases, monitoring products, protocols, or command examples unless the user supplied them or explicitly asks for tool-specific guidance. + +{{ROUTER_CONTENT}} + +{{TOOL_MAPPING}} + diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/router-eval-set.yaml b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/router-eval-set.yaml new file mode 100644 index 00000000..2e71b860 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/router-eval-set.yaml @@ -0,0 +1,628 @@ +cases: + - prompt: "We are splitting billing from checkout and need a design decision before we lock service boundaries, responsibility, and failure behavior." 
+ expected_primary: architecture-decisions + expected_behavior: "infer architecture decision from boundaries, responsibility, and decision timing" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "A partner-facing endpoint needs to rename a response field and return a new error shape, but older clients cannot break." + expected_primary: api-design-and-compatibility + expected_behavior: "route to compatibility and client migration instead of generic architecture" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect the API changes in this branch and tell me what could break existing clients." + expected_primary: api-design-and-compatibility + expected_behavior: "route API branch check to client compatibility instead of generic PR review" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect the endpoint code and API schema docs for this new pagination behavior before we merge." + expected_primary: api-design-and-compatibility + expected_behavior: "route endpoint/schema check before merge to API compatibility, not generic PR review" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Several SDKs and partner clients still read this response field; check compatibility and removal steps before changing it." + expected_primary: api-design-and-compatibility + expected_behavior: "route response field deprecation to exposed API contract migration" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We need operation names and request shapes for generated client methods that create, list, batch-get, and cancel jobs without confusing callers." 
+ expected_primary: api-design-and-compatibility + expected_behavior: "route to API operation naming and generated-client contract ergonomics" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Let's design an API for partners to create, list, and cancel jobs without breaking future clients." + expected_primary: api-design-and-compatibility + expected_behavior: "route open-ended API design to compatibility and client contract guidance before generic brainstorming" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Design a batch endpoint and filtered list endpoint for partner resources, including item limits, partial failures, page tokens, and empty results." + expected_primary: api-design-and-compatibility + expected_behavior: "route to batch and list API contract shape instead of dependency resilience" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We need to decide which customer-facing failures count against checkout reliability and when they should page someone." + expected_primary: slo-and-error-budgets + expected_behavior: "route to reliability objectives and paging policy" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Look at this service's routes, dashboards, and alerts, then propose user-centered SLIs and SLOs." + expected_primary: slo-and-error-budgets + expected_behavior: "route SLI/SLO proposal to error-budget skill without inventing reliability-target aliases" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect the alert rules against this service's SLOs and separate urgent alerts from follow-up-only alerts." 
+ expected_primary: slo-and-error-budgets + expected_behavior: "route urgent-vs-follow-up rules tied to SLOs to error-budget skill" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Use the service code and recent incidents to draft error-budget release rules." + expected_primary: slo-and-error-budgets + expected_behavior: "route error-budget release rules to the exact slo-and-error-budgets skill" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The service has to keep serving traffic if an entire availability fault domain disappears during peak traffic." + expected_primary: high-availability-design + expected_behavior: "route to fault-domain topology, not disaster recovery" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "A location-scoped service must avoid cross-location runtime dependencies, show per-fault-domain health, and keep enough static capacity to survive one location loss." + expected_primary: high-availability-design + expected_behavior: "route to fault-domain independence and static failover capacity" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The order flow melts whenever the fraud service slows down, and retries seem to make the queue worse." + expected_primary: dependency-resilience + expected_behavior: "infer dependency overload from prose without relying on product names" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "The dependency client needs an adaptive retry budget, must stop retrying on overload signals, and batch retries should skip items that already succeeded." 
+ expected_primary: dependency-resilience + expected_behavior: "route to retry-budget and partial-batch dependency behavior" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The bug report keeps blaming the cache product, but the actual failure is slow cache reads causing checkout retries and queue buildup." + expected_primary: dependency-resilience + expected_behavior: "translate named infrastructure into dependency overload behavior instead of routing by tool class" + category: paraphrase + expected_checks: [single_primary, capability_translation, intent_inference] + - prompt: "After the last release, tail latency doubled and the worker pool sits saturated during the evening spike." + expected_primary: performance-and-capacity + expected_behavior: "route to performance and capacity regression analysis" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "A bad maintenance job corrupted rows, and we need to verify restore works before reopening writes." + expected_primary: backup-and-recovery + expected_behavior: "route to restore and corruption recovery; suggest incident response only if impact is active" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Plan a controlled failover exercise for the critical checkout path, including abort criteria and blast-radius limits." + expected_primary: resilience-experiments + expected_behavior: "route to experiment design, not topology design" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Design the new payout state machine before implementation, including states, transitions, must-never rules, must-eventually rules, and retry cases." 
+ expected_primary: state-machine-correctness + expected_behavior: "route new state-machine design to correctness and invariant design" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before shipping the distributed lock, we want invariants and counterexamples for split-brain behavior." + expected_primary: state-machine-correctness + expected_behavior: "route high-assurance distributed lock invariants and counterexamples to state-machine correctness" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "During incidents we cannot tell whether failures start in checkout, payment, or fulfillment from the telemetry we have." + expected_primary: observability-and-alerting + expected_behavior: "route to observability because debuggability signals are the artifact" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "The incident is still ongoing, customers see failed checkouts, and we need a commander, status cadence, and timeline." + expected_primary: incident-response-and-postmortems + expected_behavior: "active incident handling takes priority over suspected root cause" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We get paged dozens of times a night for self-healing failures, and the runbook work is mostly manual." + expected_primary: oncall-health + expected_behavior: "route to page and toil reduction, not staffing policy" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before merge, what tests and checks should block this payment workflow change from landing?" 
+ expected_primary: testing-and-quality-gates + expected_behavior: "route to merge and quality checks" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "This config change flips routing defaults for checkout, and we need validation, preview, user confirmation, and a rollback path before it runs." + expected_primary: configuration-and-automation-safety + expected_behavior: "route to configuration and automation safety rather than generic rollout" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "An operational script will touch an unlaunched production environment, so we need a change record, user confirmation, preview, blast-radius limit, and recovery path." + expected_primary: configuration-and-automation-safety + expected_behavior: "route to production automation change controls even before launch" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "What should we require before this lands?" + expected_primary: testing-and-quality-gates + expected_behavior: "infer a landing-check request and route to quality checks without intake questions" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "We need to cut a release candidate with a version, artifact identity, required checks, promotion path, and rollback target." + expected_primary: release-build-reproducibility + expected_behavior: "route release candidate mechanics and artifact identity to build reproducibility, not rollout" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Release builds are not reproducible across machines, and the artifact we promote is not always the one we tested." 
+ expected_primary: release-build-reproducibility + expected_behavior: "route reproducible build and tested artifact identity problems to release build reproducibility" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We need to expose the new checkout path to one percent of traffic, watch guardrails, and roll back quickly if errors climb." + expected_primary: progressive-delivery + expected_behavior: "route to production exposure strategy" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Plan a customer-impacting deploy that moves through one instance, one partition, one deployment unit, and then location waves with bake time and automatic rollback on availability or latency." + expected_primary: progressive-delivery + expected_behavior: "route to phased deployment waves and rollback checks" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The new payments service is supposed to take full production traffic tomorrow, and leadership wants a readiness call." + expected_primary: production-readiness-review + expected_behavior: "use production readiness as an aggregator because launch readiness is explicit" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We are retiring the old account service over two quarters and need to move callers without backsliding." + expected_primary: migration-and-deprecation + expected_behavior: "route to broad migration and deprecation controls" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The runtime upgrade affects hundreds of services, and old and new versions must coexist until every component moves." 
+ expected_primary: fleet-upgrades + expected_behavior: "route to fleet upgrade support windows and mixed-version compatibility" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Reviews sit idle for days, responsibility paths are unclear, and large changes keep landing late in the cycle." + expected_primary: none + expected_behavior: "withhold routing for generic review-workflow process with no concrete engineering surface" + category: out_of_scope + expected_checks: [scope_check] + forbidden_in_response: [all_specialist_names] + - prompt: "Our runbooks and design docs contradict each other; we need responsibility paths, source of truth, freshness rules, and archive criteria." + expected_primary: documentation-lifecycle + expected_behavior: "route to engineering documentation lifecycle instead of architecture decision work" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Make a small mechanical correction in one documentation file and commit it." + expected_primary: none + expected_behavior: "handle as routine docs maintenance without routing to a Staff Engineer Mode specialist" + category: out_of_scope + expected_checks: [scope_check] + forbidden_in_response: [all_specialist_names] + - prompt: "Clean up stale libraries, remove dead feature paths, and keep the lockfile from drifting again." + expected_primary: dependency-and-code-hygiene + expected_behavior: "route to dependency and code health, not supply-chain trust" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Threat model the new admin privilege flow, including abuse cases and security requirements before implementation." 
+ expected_primary: secure-sdlc-and-threat-modeling + expected_behavior: "route to secure design and abuse-case reasoning" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Map service identities, key rotation, and who can read production secrets for this service." + expected_primary: identity-and-secrets + expected_behavior: "route to identity and secrets, not broad secure-design work" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Certificates expire next quarter, and the trust-chain change needs inventory, compatibility tests, renewal alerts, and revocation steps." + expected_primary: cryptography-and-key-lifecycle + expected_behavior: "route to cryptographic lifecycle instead of general identity or secrets work" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We need provenance, signing, and isolated builders so release artifacts can be trusted." + expected_primary: software-supply-chain-security + expected_behavior: "route to build and deploy trust controls" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "A known exploited vulnerability is deployed in production, and we need prioritization, exception rules, and a fix deadline." + expected_primary: vulnerability-management + expected_behavior: "route to vulnerability triage and patch SLA" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "A tenant should never be able to read another tenant's data, and noisy neighbors must not exhaust shared capacity." + expected_primary: tenant-isolation + expected_behavior: "route to tenant boundaries and blast-radius control" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Design deletion, retention, export, and minimization controls for user activity events." 
+ expected_primary: privacy-and-data-lifecycle + expected_behavior: "route to privacy data lifecycle, not tenant isolation" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Map our engineering controls to records from normal engineering work for several standards without turning it into a legal policy exercise." + expected_primary: engineering-control-evidence + expected_behavior: "route only because multi-control engineering record mapping is explicit" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We need to choose the write model for account balances, including consistency, shard growth, and hot-key behavior." + expected_primary: distributed-data-and-consistency + expected_behavior: "route to storage and consistency design" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The order workflow uses events across payment, fulfillment, and email; we need replay, dead-letter handling, and schema evolution." + expected_primary: event-workflows + expected_behavior: "route to event workflow semantics" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We are designing a shared customer dataset for billing and support, and need schema responsibility, compatibility rules, and consumer checks before launch." + expected_primary: data-contracts + expected_behavior: "route new shared dataset contract design to data contracts instead of pipeline freshness or one API review" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect the schema and downstream usage, then define the contract and removal steps for this field." 
+ expected_primary: data-contracts + expected_behavior: "route schema downstream-usage removal steps to data contracts, not dependency resilience" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The product page shows stale inventory after updates, and cache invalidation keeps breaking during traffic spikes." + expected_primary: caching-and-derived-data + expected_behavior: "route to cache and derived-state operations" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The user profile cache serves stale entries after writes, and the invalidator misses some update paths." + expected_primary: caching-and-derived-data + expected_behavior: "route stale cache entries and invalidation mechanics to caching and derived data" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Account balance reads may be stale across replicas, and we need to decide which consistency guarantees are acceptable." + expected_primary: distributed-data-and-consistency + expected_behavior: "route stale-read semantics and consistency guarantees to distributed data" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Backfill a production table with no long locks, no replica lag surprise, and a rollback plan." + expected_primary: database-operations + expected_behavior: "route to production database execution" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The daily revenue pipeline missed its freshness target, and reprocessing can double-count late events." + expected_primary: data-pipeline-reliability + expected_behavior: "route to pipeline freshness and idempotent reprocessing" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The recommendation model needs release checks for drift, training-serving skew, rollback, and bad prediction spikes." 
+ expected_primary: ml-reliability-and-evaluation + expected_behavior: "route to production ML reliability and evaluation" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "New services keep hand-rolling setup; we need a golden path, catalog entry, scorecard, and default templates." + expected_primary: platform-golden-paths + expected_behavior: "route to internal platform standardization" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Infrastructure changes should be reconciled from declarative definitions, blocked by policy before deploy, and checked for drift." + expected_primary: infrastructure-and-policy-as-code + expected_behavior: "route to declarative infrastructure and policy enforcement" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Public traffic spikes and abusive clients are exhausting origin capacity before requests reach the application." + expected_primary: edge-traffic-and-ddos-defense + expected_behavior: "route to public edge defense and origin protection" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Internal services need safer service discovery, authenticated service-to-service transport, and locality-aware routing." + expected_primary: internal-service-networking + expected_behavior: "route to internal networking and traffic policy" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Configure Istio retry, timeout, and circuit-breaker policy for east-west checkout calls that overload payment." + expected_primary: dependency-resilience + expected_behavior: "translate the mesh tool name into dependency overload controls" + category: paraphrase + expected_checks: [single_primary, capability_translation, intent_inference] + - prompt: "Design east-west service identity, discovery, and locality-aware traffic policy between checkout and payment." 
+ expected_primary: internal-service-networking + expected_behavior: "route identity, discovery, and locality traffic policy to internal networking" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Reduce infrastructure spend while preserving the current reliability target and enough headroom for peak traffic." + expected_primary: cost-aware-reliability + expected_behavior: "route because cost and reliability tradeoff are both explicit" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Scale checkout for ten times normal traffic during a promotion without adding customer-visible latency." + expected_primary: performance-and-capacity + expected_behavior: "do not route to cost work unless spend or allocation tradeoffs are explicit" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Secure the LLM agent before launch, especially tool permissions, prompt injection, retrieval boundaries, and unsafe output handling." + expected_primary: llm-application-security + expected_behavior: "route to LLM application security, not broad AI strategy" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect a GenAI product for prompt injection, sensitive prompt redaction, prompt and response storage, retrieval leaks, and emergency stop controls." + expected_primary: llm-application-security + expected_behavior: "route GenAI application security controls to LLM application security" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The model-backed app accepts file uploads and renders model-generated markdown links; decide input validation, output handling, and unsafe-sink controls before launch." 
+ expected_primary: llm-application-security + expected_behavior: "route multimodal and rendered-output risks to LLM application security" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Define retention, deletion, export, and access controls for stored prompts and responses in logs, support tooling, and analytics." + expected_primary: privacy-and-data-lifecycle + expected_behavior: "route prompt and response data lifecycle to privacy unless LLM boundary risk dominates" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Verify signing, provenance, builder isolation, and admission controls for a model artifact before deployment, with no prompt, retrieval, or tool boundary in scope." + expected_primary: software-supply-chain-security + expected_behavior: "route generic model artifact trust to supply-chain security instead of LLM app security" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Plan staged rollout and rollback of a model-backed feature using canary metrics, exposure rings, and a kill switch after guardrail metrics regress." + expected_primary: progressive-delivery + expected_behavior: "route model-backed rollout sequencing to progressive delivery instead of LLM app security" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Secure our prompt so the agent cannot misuse tools, leak retrieved data, or follow malicious instructions." + expected_primary: llm-application-security + expected_behavior: "route prompt security with tool and retrieval risk to LLM application security" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "The prompt change needs eval cases, graders, thresholds, slice results, and regression history before we ship it." 
+ expected_primary: llm-evaluation + expected_behavior: "route to LLM evaluation harnesses instead of LLM app security" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Evaluate our prompt with regression cases, rubric graders, thresholds, and slice coverage before the model-backed workflow changes." + expected_primary: llm-evaluation + expected_behavior: "route prompt evaluation details to the LLM eval harness skill" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "We need repo instructions for coding agents: allowed actions, protected paths, data boundaries, required tests, and traceability details." + expected_primary: ai-coding-governance + expected_behavior: "route to AI-assisted development controls instead of deployed LLM app security" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The native mobile release should stop if crash-free users drop, startup regresses, or offline sync breaks during staged rollout." + expected_primary: mobile-release-engineering + expected_behavior: "route to mobile release stability and crash budgets" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Stop browser UI performance regressions from shipping when payload weight, interaction readiness, or visual stability gets worse." + expected_primary: web-release-gates + expected_behavior: "route to technology-agnostic frontend performance release checks" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The checkout form release needs keyboard completion, focus order, labels, contrast, and assistive-technology checks before launch." 
+ expected_primary: accessibility-gates + expected_behavior: "route to accessibility conformance checks instead of frontend performance" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "The A/B test readout looks suspicious; check assignment balance, exposure logging, guardrail metrics, and sample-ratio mismatch before ramping." + expected_primary: experimentation-and-metric-guardrails + expected_behavior: "route to experiment validity and metric guardrails instead of rollout canary safety" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before building the admin export flow, reason through abuse cases, input validation, and security requirements." + expected_primary: secure-sdlc-and-threat-modeling + expected_behavior: "route pre-deploy abuse-case reasoning to secure SDLC threat modeling" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "The admin export flow already deployed with a known exploitable vulnerability, and we need fix priority and a deadline." + expected_primary: vulnerability-management + expected_behavior: "route deployed vulnerable code to vulnerability management and patch SLA" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Add enough backend headroom for the launch spike without increasing customer-visible latency." + expected_primary: performance-and-capacity + expected_behavior: "route headroom and latency without spend tradeoffs to capacity and performance" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Reduce peak compute spend while preserving latency targets and enough launch headroom." 
+ expected_primary: cost-aware-reliability + expected_behavior: "route explicit spend and reliability headroom tradeoffs to FinOps" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Pages are noisy, and we also need reliability targets; first reduce wakeups while capturing the SLO follow-up." + expected_primary: oncall-health + expected_secondary: slo-and-error-budgets + expected_behavior: "choose page pain as primary and reliability target policy as secondary" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "Build Datadog dashboards that show where checkout failures start, then use those details to tune reliability objectives." + expected_primary: observability-and-alerting + expected_secondary: slo-and-error-budgets + expected_behavior: "translate the dashboard product into telemetry construction, with SLO policy as the separate follow-up" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, capability_translation, intent_inference] + - prompt: "Rename a field in the partner API while also updating the shared customer dataset contract consumed by internal components." + expected_primary: api-design-and-compatibility + expected_secondary: data-contracts + expected_behavior: "choose partner-facing API compatibility as primary and shared data contracts as the separate artifact" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "Make the release artifact reproducible, then expose the tested checkout build to one percent of traffic with rollback criteria." 
+ expected_primary: release-build-reproducibility + expected_secondary: progressive-delivery + expected_behavior: "choose artifact identity first, with production exposure as the secondary release artifact" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "The checkout form release needs keyboard accessibility checks and should also stop if browser interaction readiness regresses." + expected_primary: accessibility-gates + expected_secondary: web-release-gates + expected_behavior: "choose accessibility conformance as primary and browser performance checks as the separate secondary" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "Secure the LLM agent against prompt injection and add eval thresholds that block unsafe prompt changes." + expected_primary: llm-application-security + expected_secondary: llm-evaluation + expected_behavior: "choose LLM app security for unsafe-action risk and LLM eval harnesses for the release-check follow-up" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "Make alerts better for checkout." + expected_primary: observability-and-alerting + expected_behavior: "infer alert improvement for a checkout engineering surface without intake questions" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Improve the mobile app." + expected_primary: mobile-release-engineering + expected_behavior: "infer mobile engineering quality and release guidance without intake questions" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Use staff engineer mode to troubleshoot a network issue." 
+ expected_primary: internal-service-networking + expected_behavior: "infer internal service networking investigation from an engineering network issue without intake questions" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Troubleshoot a network issue." + expected_primary: internal-service-networking + expected_behavior: "infer internal service networking investigation from a terse network issue prompt without intake questions" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Roll the database migration out safely while also adding checks that block future incompatible schema changes." + expected_primary: database-operations + expected_secondary: testing-and-quality-gates + expected_behavior: "choose database operations as primary because production schema execution carries the immediate risk" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "The streaming job is double-counting late events after replay, but the message contract and dead-letter workflow are also unclear." + expected_primary: data-pipeline-reliability + expected_secondary: event-workflows + expected_behavior: "choose pipeline reliability because freshness and idempotent reprocessing carry the failure; suggest workflow semantics as secondary" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "Launch the new checkout service after we finish threat modeling, load testing, dashboards, rollback, and on-call docs." + expected_primary: production-readiness-review + expected_behavior: "use readiness decision as the aggregator because launch readiness is the requested artifact" + category: mixed_intent + expected_checks: [single_primary, secondary_cap, intent_inference] + - prompt: "Write a marketing launch plan for the new checkout feature." 
+ expected_primary: none + expected_behavior: "decline out-of-scope marketing work without asking intake questions or naming specialists" + category: out_of_scope + expected_checks: [scope_check] + forbidden_in_response: [all_specialist_names] + - prompt: "How much should we pay engineers for being on call?" + expected_primary: none + expected_behavior: "decline out-of-scope compensation work without asking intake questions or naming specialists" + category: out_of_scope + expected_checks: [scope_check] + forbidden_in_response: [all_specialist_names] + - prompt: "Inspect this AI-generated diff and tell me what a senior pre-merge pass would catch before we merge it." + expected_primary: agent-pr-review + expected_behavior: "route per-diff senior pre-merge review of agent output to agent PR review" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "What did my agent miss in this branch — silent assumptions, hallucinated APIs, deleted-but-used code?" + expected_primary: agent-pr-review + expected_behavior: "route AI-failure-mode pre-merge check on a specific diff to agent PR review instead of org policy" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Is this branch from an AI run safe to merge once the tests pass?" + expected_primary: agent-pr-review + expected_behavior: "route per-diff agent-output review to agent PR review instead of generic code review workflow" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "The rollout is finished; build me an inventory of every live flag with expiry, safe fallback, and removal plan." 
+ expected_primary: feature-flag-lifecycle + expected_behavior: "route post-rollout flag inventory and removal planning to feature flag lifecycle" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Find the orphan flags in this repo whose feature shipped or whose behavior is no longer explained and propose a safe removal sequence." + expected_primary: feature-flag-lifecycle + expected_behavior: "route orphan flag detection and removal planning to feature flag lifecycle instead of generic code hygiene" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Set token and p95 latency budgets for this LLM-backed route and define the fallback when the provider degrades." + expected_primary: llm-serving-cost-and-latency + expected_behavior: "route per-route LLM token and latency budgeting plus degradation path to LLM serving cost and latency" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Map per-call LLM spend to route, feature, and tenant, then design the prompt and response caches." + expected_primary: llm-serving-cost-and-latency + expected_behavior: "route LLM cost attribution and prompt/response cache design to LLM serving cost and latency instead of generic FinOps" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Map this repo's module boundaries, names, and file sizes so an AI agent can locate the canonical implementation in one tool call." + expected_primary: code-readability-for-agents + expected_behavior: "route repo-as-artifact legibility mapping for AI comprehension to code readability for agents" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Find names in this codebase that collide or mislead code search and propose renames an agent can rely on." 
+ expected_primary: code-readability-for-agents + expected_behavior: "route name-collision and code-search-misleading checks to code readability for agents instead of architecture decisions" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Inventory the fixtures this suite depends on, verify the anonymization policy holds, and define when each fixture must refresh." + expected_primary: test-data-engineering + expected_behavior: "route fixture inventory, anonymization policy, and freshness decisions to test data engineering" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Find where production data shape has drifted from the data the tests run on and design a drift-detection check." + expected_primary: test-data-engineering + expected_behavior: "route production/test data drift detection to test data engineering instead of generic test strategy" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Build a parity matrix across local, CI, staging, and production for this service and find divergences nobody named." + expected_primary: dev-environment-parity + expected_behavior: "route environment parity matrix and unnamed divergence discovery to dev environment parity" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "This fix worked locally and failed in CI; trace the environment dimensions that differ and tell me which one hid the bug." + expected_primary: dev-environment-parity + expected_behavior: "route 'works only in one environment' failures to dev environment parity instead of build reproducibility" + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect this downstream payment dependency call and find where retries could double-charge or duplicate work." 
+ expected_primary: dependency-resilience + expected_behavior: "route downstream call retries and duplicate work to dependency resilience instead of event workflows" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Find runtime configuration drift and temporary overrides that an automation change could apply, then add owners, expiry, validation, and rollback." + expected_primary: configuration-and-automation-safety + expected_behavior: "route runtime config drift with automation rollback controls to configuration safety" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Build a production-readiness decision for this migration PR and identify launch blockers before we move traffic tomorrow." + expected_primary: production-readiness-review + expected_behavior: "route explicit readiness decision before traffic movement to PRR instead of rollout sequencing" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect this dependency update and lockfile sweep for migration, hygiene, and rollback risks." + expected_primary: dependency-and-code-hygiene + expected_behavior: "route dependency update and lockfile sweep to dependency hygiene instead of generic PR review" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect these on-call suppression rules and verify we are reducing page noise without hiding real user impact." + expected_primary: oncall-health + expected_behavior: "route suppression rules and page-noise reduction to oncall health instead of observability design" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect this cross-service distributed-data lock and decide whether the consistency model is safe during failover or replication lag." 
+ expected_primary: distributed-data-and-consistency + expected_behavior: "route distributed-data lock safety under failover and replication lag to consistency design" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect this event message change and find producer or consumer replay, ordering, idempotency, or DLQ behavior that might break." + expected_primary: event-workflows + expected_behavior: "route event message behavior under replay, ordering, idempotency, and DLQ to event workflows" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Check the derived search-index refresh path and define stale-result freshness checks we can verify." + expected_primary: caching-and-derived-data + expected_behavior: "route derived search-index freshness and stale results to caching and derived data" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Use the query plan and schema migration diff to find why this endpoint got slower after the database change." + expected_primary: database-operations + expected_behavior: "route database-caused endpoint regression with query plan and schema migration details to database operations" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Define ML model-serving promotion checks for this endpoint, including eval thresholds, training-serving skew, drift, rollback, tests, metrics, and deploy workflow." + expected_primary: ml-reliability-and-evaluation + expected_behavior: "route ML model-serving promotion checks to ML reliability instead of generic readiness aggregation" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect internal service-to-service routing config and keep this private dependency's traffic local when possible." 
+ expected_primary: internal-service-networking + expected_behavior: "route internal traffic locality and service-to-service routing config to internal networking" + category: direct + expected_checks: [single_primary, intent_inference] diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/router-phase-eval-set.yaml b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/router-phase-eval-set.yaml new file mode 100644 index 00000000..e38391db --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/router-phase-eval-set.yaml @@ -0,0 +1,221 @@ +cases: + - prompt: "Before we build asynchronous order events, design idempotency, ordering, replay, retry, and dead-letter behavior." + expected_primary: event-workflows + expected_behavior: "route pre-code event workflow design to event-workflows" + expected_phase: design + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "I want to add account deletion; help shape the retention, erasure, export, and audit behavior before implementation." + expected_primary: privacy-and-data-lifecycle + expected_behavior: "route ideation-stage personal-data lifecycle planning to privacy-and-data-lifecycle" + expected_phase: ideation + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "We are planning a new checkout API; choose request shapes, errors, idempotency, pagination, and compatibility rules before code." + expected_primary: api-design-and-compatibility + expected_behavior: "route API ideation and design to compatibility before implementation exists" + expected_phase: design + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before implementing tenant quotas, design the tenant context, partitioning, noisy-neighbor limits, and cross-tenant tests." 
+ expected_primary: tenant-isolation + expected_behavior: "route pre-code tenant-boundary design to tenant-isolation" + expected_phase: design + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Plan the implementation of a staged rollout for this model-backed feature with canary metrics, kill switch, and rollback." + expected_primary: progressive-delivery + expected_behavior: "route release planning for staged exposure to progressive-delivery" + expected_phase: release + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before writing the distributed lock, help us define invariants, counterexamples, and stronger-than-example tests." + expected_primary: state-machine-correctness + expected_behavior: "route pre-development concurrency invariant design to state-machine-correctness" + expected_phase: testing + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Design observability for a new service so incidents show where user-visible failures start." + expected_primary: observability-and-alerting + expected_behavior: "route new-service telemetry design to observability-and-alerting" + expected_phase: design + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Help design the implementation plan for a production backfill with throttling, verification, abort criteria, and delayed cleanup." + expected_primary: database-operations + expected_behavior: "route development planning for backfill execution to database-operations" + expected_phase: development + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We are ideating an internal platform template for new services with telemetry, secrets, deploy, runbook, and owner defaults." 
+ expected_primary: platform-golden-paths + expected_behavior: "route platform ideation for reusable safe defaults to platform-golden-paths" + expected_phase: ideation + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Before implementing a file-upload feature in the model-backed app, design prompt-injection, retrieval, tool, and output-sink controls." + expected_primary: llm-application-security + expected_behavior: "route pre-code LLM app security design to llm-application-security" + expected_phase: design + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Plan tests and merge checks for a payment workflow change before implementation starts." + expected_primary: testing-and-quality-gates + expected_behavior: "route testing check design before development starts to testing-and-quality-gates" + expected_phase: testing + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "During development of the checkout inventory call, decide timeout, retry, fallback, and duplicate-work safeguards before wiring callers." + expected_primary: dependency-resilience + expected_behavior: "route development guidance for downstream failure handling to dependency-resilience" + expected_phase: development + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Define a failure-injection test for the queue broker with blast-radius limits, stop conditions, and learning goals." + expected_primary: resilience-experiments + expected_behavior: "route testing-phase resilience probe design to resilience-experiments" + expected_phase: testing + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before releasing the mobile app, decide staged rollout pause metrics, crash thresholds, and forward-fix versus rollback rules." 
+ expected_primary: mobile-release-engineering + expected_behavior: "route release decision guidance for mobile rollout to mobile-release-engineering" + expected_phase: release + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "After the flag ships everywhere, plan owner checks, expiry cleanup, and stale off-path removal." + expected_primary: feature-flag-lifecycle + expected_behavior: "route maintenance cleanup decisions for shipped flags to feature-flag-lifecycle" + expected_phase: maintenance + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Set maintenance rules for stale runbooks, source-of-truth owners, refresh triggers, and archival criteria." + expected_primary: documentation-lifecycle + expected_behavior: "route maintenance controls for docs to documentation-lifecycle" + expected_phase: maintenance + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "We are deciding whether to build a new service boundary for refunds; guide the tradeoffs and revisit conditions before a design doc exists." + expected_primary: architecture-decisions + expected_behavior: "route ideation-stage architecture decision shaping to architecture-decisions" + expected_phase: ideation + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Inspect this concrete diff before merge for intent match, behavior verification, edge cases, and deleted-but-still-used code." + expected_primary: agent-pr-review + expected_behavior: "route concrete pre-merge diff review to agent-pr-review" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "An incident is active: checkout errors spiked after a deploy twenty minutes ago; build the timeline, owners, next update, and immediate decisions." 
+ expected_primary: incident-response-and-postmortems + expected_behavior: "route active incident response to incident-response-and-postmortems" + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before tomorrow's tier-1 launch, run readiness across code, deploy config, dashboards, runbooks, rollback, and support details." + expected_primary: production-readiness-review + expected_behavior: "route launch go/no-go readiness to production-readiness-review" + expected_phase: release + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "A deployed advisory affects two customer-facing services; decide patch order, exploitability, exposure, exceptions, and remediation deadlines." + expected_primary: vulnerability-management + expected_behavior: "route deployed vulnerability remediation to vulnerability-management" + expected_phase: maintenance + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Before any code exists, design the team review system: ownership routing, change-size limits, self-review rules, and reviewer latency targets." + expected_primary: none + expected_behavior: "withhold routing for generic review-system design with no concrete engineering surface" + category: out_of_scope + expected_checks: [scope_check] + forbidden_in_response: [all_specialist_names] + - prompt: "Review the checkout API design before code exists for error shapes, idempotency, pagination, and compatibility." + expected_primary: api-design-and-compatibility + expected_behavior: "route design-review wording by API compatibility surface, not to agent-pr-review" + expected_phase: design + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Run a security review of the new admin export design before implementation: authorization, trust boundaries, unsafe inputs, and abuse cases." 
+ expected_primary: secure-sdlc-and-threat-modeling + expected_behavior: "route security-review wording by threat-modeling surface when no diff exists" + expected_phase: design + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Review the rollout plan for a staged release with canary metrics, abort criteria, and rollback before there is a go/no-go meeting." + expected_primary: progressive-delivery + expected_behavior: "route rollout-review wording by rollout surface until readiness is explicit" + expected_phase: release + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Before implementing a new admin export, threat-model authorization gaps, unsafe inputs, trust boundaries, and abuse cases." + expected_primary: secure-sdlc-and-threat-modeling + expected_behavior: "route pre-code security design away from deployed vulnerability management" + expected_phase: design + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Design telemetry and alert signals for a new service so incident responders can tell where user-visible failures begin." + expected_primary: observability-and-alerting + expected_behavior: "route incident-prevention design away from active incident handling" + expected_phase: design + category: direct + expected_checks: [single_primary, intent_inference] + - prompt: "Plan a staged feature rollout with canary metrics and rollback criteria before there is a launch date or readiness decision." + expected_primary: progressive-delivery + expected_behavior: "route rollout design away from production-readiness-review until there is a go/no-go event" + expected_phase: release + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Shape a build-artifact reproducibility plan with pinned inputs, immutable artifact identity, and promote-only release flow." 
+ expected_primary: release-build-reproducibility + expected_behavior: "route natural build-artifact wording to current release-build-reproducibility slug" + expected_phase: design + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Build an engineering records mapping plan that connects CI, approvals, dashboards, runbooks, exceptions, and release records." + expected_primary: engineering-control-evidence + expected_behavior: "route natural records-mapping wording to engineering-control-evidence" + expected_phase: design + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Plan codebase maintenance for deprecated helpers across packages with small batches, lockfile checks, and rollback checks." + expected_primary: dependency-and-code-hygiene + expected_behavior: "route natural codebase-maintenance wording to dependency-and-code-hygiene" + expected_phase: maintenance + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Before promoting the fraud model, design model reliability checks for skew, drift, eval slices, rollback, and serving details." + expected_primary: ml-reliability-and-evaluation + expected_behavior: "route natural model-reliability wording to ml-reliability-and-evaluation" + expected_phase: release + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "The checkout inventory call can duplicate work after a timeout; decide timeout budget, fallback behavior, idempotency, and verification points." + expected_primary: dependency-resilience + expected_behavior: "infer dependency-resilience from failure context without explicit phase wording" + expected_phase: development + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "A flag is on everywhere and old branches now confuse support; decide owner checks, expiry cleanup, and safe removal checks." 
+ expected_primary: feature-flag-lifecycle + expected_behavior: "infer feature-flag lifecycle from old-branch context without explicit phase wording" + expected_phase: maintenance + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "This queue workflow can lose work when email succeeds before payment settles; define the message contract, idempotency keys, retry and DLQ handling, replay behavior, and failure probes." + expected_primary: event-workflows + expected_behavior: "infer event workflow testing/design from failure context without explicit phase wording" + expected_phase: testing + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "The public signup path is getting suspicious spikes; choose edge limits that protect origin without blocking real users." + expected_primary: edge-traffic-and-ddos-defense + expected_behavior: "infer edge-defense routing from traffic-risk context without explicit phase wording" + expected_phase: release + category: paraphrase + expected_checks: [single_primary, intent_inference] + - prompt: "Write warmer onboarding copy for a marketing landing page." + expected_primary: none + expected_behavior: "decline product/editorial ideation without naming specialists" + category: out_of_scope + expected_checks: [scope_check] + forbidden_in_response: [all_specialist_names] diff --git a/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/routing-matrix.md b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/routing-matrix.md new file mode 100644 index 00000000..63b2c706 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/skills/staff-engineer-mode/references/routing-matrix.md @@ -0,0 +1,90 @@ +# Routing Matrix Notes + +## Decision Frame + +1. Identify the requested artifact: decision, design, plan, readiness check, rollout, investigation, runbook, migration, eval, control pack, or diff review. +2. 
Identify the work phase: ideation, design, development, testing, pre-merge, launch, migration, active incident, post-incident, regression, readiness, or steady-state maintenance. +3. Identify the dominant risk: availability, latency, durability, correctness, security, privacy, compatibility, operator load, cost, release safety, or customer experience. +4. Route to one primary. Add a secondary only when the user explicitly asks for a separate artifact. +5. Infer missing artifact, phase, surface, and risk from prompt, repo, files, branch context, and conversation; withhold routing only when no in-scope engineering lifecycle/control frame exists. + +## Exact Slug Guardrails + +Return the canonical `specialists/<slug>.md` slug, not a semantic alias. + +- Fault-domain topology, static failover capacity, location-loss survivability, and availability assumptions are `high-availability-design`; chaos tests, game days, failover drills, and fault injection are `resilience-experiments`. +- New API surfaces, operation/resource shape, generated-client ergonomics, existing-client callers, backwards compatibility, and safe deprecation of an exposed API response field are `api-design-and-compatibility`, not broad migration. +- RTO/RPO, backup, restore, corruption, accidental deletion, or DR restore tests are `backup-and-recovery`, not backup/restore aliases. +- Showing that a controlled failure test itself is safe and scoped is `resilience-experiments`; showing that the topology has enough already-available capacity under domain loss is `high-availability-design`. +- Downstream dependency calls, retries, timeouts, idempotency, duplicate work, and overload behavior are `dependency-resilience`, unless the prompt is mainly event replay, ordering, or DLQ behavior. +- Designing or building state machines, protocol correctness, locking, concurrency, invariants, property tests, fuzzing, simulation, or model checking are `state-machine-correctness`.
+- Production config changes, generated operations, bulk scripts, validation, preview, blast-radius limits, abort paths, or rollback before mutation are `configuration-and-automation-safety`, not config aliases. +- Runtime configuration drift and temporary overrides that need owners, expiry, validation, or rollback before automation applies them are `configuration-and-automation-safety`; desired-state capture, drift detection, reconciliation, and emergency exception rules after manual infrastructure changes are `infrastructure-and-policy-as-code`. +- Generic code-review purpose, change-size limits, ownership, review latency, and workflow metrics have no routed specialist unless a concrete engineering surface is present; one concrete pre-merge diff review is `agent-pr-review`. +- A concrete diff, branch, PR, or last-commit review stays with `agent-pr-review` even when the prompt says tests pass, changed behavior needs verification, or edge cases may be missing; test strategy without a concrete diff is `testing-and-quality-gates`. +- A deprecation PR, sunset change, or removal diff that asks for no-new-usage checks, migration controls, or backsliding prevention routes to `migration-and-deprecation`, not `agent-pr-review`. +- Static-analysis backlogs, warning ratchets, dead-code cleanup, and maintenance-risk prioritization route to `dependency-and-code-hygiene` even when changed files are available; changed files alone do not make the request a pre-merge diff review. +- The word "review" is not enough to select `agent-pr-review`: design review, security review, API review, data review, rollout review, or test review without a concrete diff routes by the engineering surface. +- A surface-specific change before merge still routes to the narrow surface specialist when the requested artifact is compatibility, safety, rollout, security, accessibility, data, or test results rather than a general diff verdict. 
+- Dependency updates, lockfile sweeps, migration notes, rollback risks, and small-batch hygiene are `dependency-and-code-hygiene`, even when packaged as a PR. +- On-call suppression rules, noisy pages, responder load, toil, and checking that page reduction does not hide user impact are `oncall-health`; new alert design remains `observability-and-alerting`. +- Cross-service distributed-data locks tied to consistency, failover, conflicts, or replication lag are `distributed-data-and-consistency`; local protocol invariants remain `state-machine-correctness`. +- Event message producer/consumer replay, ordering, idempotency, and DLQ behavior are `event-workflows`; shared schema compatibility alone is `data-contracts`. +- Derived search indexes, materialized views, cache invalidation, and stale-result freshness are `caching-and-derived-data`; batch or streaming pipeline freshness is `data-pipeline-reliability`. +- Query plans, schema migrations, indexes, backfills, locks, and database-caused endpoint regressions are `database-operations`; general hot-path or capacity regressions are `performance-and-capacity`. +- Database migration execution is `database-operations`; if the same prompt separately asks for future blocking checks against incompatible schema changes, add `testing-and-quality-gates` as the secondary. +- Threat models, trust boundaries, data flows, abuse cases, and residual-risk registers are `secure-sdlc-and-threat-modeling`, not threat-modeling aliases. +- Source-to-deploy trust, isolated builders, provenance, signing, deployment admission, or untrusted artifact risk are `software-supply-chain-security`. +- Deployed vulnerabilities, exploitability, exposure, patch SLAs, remediation rollout, and expiring exceptions are `vulnerability-management`. 
+- Model-serving promotion, eval thresholds, training-serving skew, drift monitors, model rollback, and model endpoint replacement checks are `ml-reliability-and-evaluation`; broad launch readiness without ML risk is `production-readiness-review`. +- Internal service-to-service routing, discovery, locality, identity, and private dependency traffic policy are `internal-service-networking`; dependency version cleanup is `dependency-and-code-hygiene`. +- AI coding-agent repo rules, protected paths, required tests, data boundaries, and generated-code acceptance checks are `ai-coding-governance`. +- LLM eval harnesses, datasets, graders, thresholds, slice coverage, and regression history are `llm-evaluation`. + +## High-Risk Boundaries + +- Reliability targets, SLO-based alert tuning, and urgent/follow-up rules route to `slo-and-error-budgets`; telemetry construction routes to `observability-and-alerting`; alert fatigue routes to `oncall-health`. +- When a prompt mixes noisy pages and missing reliability targets, route the immediate operator pain to `oncall-health` and use `slo-and-error-budgets` only as a secondary policy artifact. +- Launch readiness routes to `production-readiness-review` only when launch, major traffic shift, tier upgrade, or broad readiness checks are explicit. Generic design decisions route elsewhere. +- Active incident command, live mitigation, and postmortem authorship route to `incident-response-and-postmortems` before root-cause specialty work. +- Newer narrow routes beat broad neighbors when their artifact is present: config/automation safety, documentation lifecycle, data contracts, accessibility checks, AI coding controls, agent PR review, LLM eval, experimentation guardrails, fleet upgrades, cryptography/key lifecycle, feature flag lifecycle, LLM serving cost and latency, code readability for agents, test data engineering, and dev environment parity. 
+- Fault-domain topology routes to `high-availability-design`; restore capability routes to `backup-and-recovery`; controlled failure tests route to `resilience-experiments`. +- Release cutting, release trains/candidates, build and artifact creation, packaging, and promotion mechanics route to `release-build-reproducibility`; production exposure and rollback route to `progressive-delivery`. +- Config, feature settings, generated operations, and automation mutation route to `configuration-and-automation-safety`; production exposure still routes to `progressive-delivery`. +- Rollout and rollback plans for any production-affecting change, including config, schema, data, or client changes, route to `progressive-delivery`; one-shot mutation without staged exposure stays with `configuration-and-automation-safety`. +- Declarative infrastructure changes with policy checks, drift detection, and reconciliation route to `infrastructure-and-policy-as-code`; ad-hoc config or automation runs against production state stay with `configuration-and-automation-safety`. +- Engineering docs route to `documentation-lifecycle` only when responsibility, source of truth, freshness, operational accuracy, lifecycle checks, or stale/missing guidance are the artifact. Routine editorial or mechanical documentation maintenance should be handled directly without a Staff Engineer Mode specialist. Architecture decisions still route to `architecture-decisions`. +- Normal merge/release checks route to `testing-and-quality-gates`; protocol, state-machine, or concurrency assurance routes to `state-machine-correctness`. +- Accessibility conformance for user-facing flows routes to `accessibility-gates`; client performance still routes to `web-release-gates` or `mobile-release-engineering`. 
+- Broad migrations, legacy retirement, and capability sunset route to `migration-and-deprecation`; routine cleanup routes to `dependency-and-code-hygiene`; new or changed exposed API contracts route to `api-design-and-compatibility`. +- Fleet upgrades, support windows, and mixed-version rollout route to `fleet-upgrades`; routine package updates stay with `dependency-and-code-hygiene`. +- Supply-chain trust controls route to `software-supply-chain-security`; deployed vulnerability remediation routes to `vulnerability-management`; routine dependency updates route to `dependency-and-code-hygiene`. +- Pre-deploy abuse-case and control reasoning routes to `secure-sdlc-and-threat-modeling`; already-deployed vulnerable code routes to `vulnerability-management`; trust in the build path routes to `software-supply-chain-security`. +- Cryptographic agility, certificate expiry, key rotation, and trust-chain lifecycle route to `cryptography-and-key-lifecycle`; runtime access and secrets policy stays with `identity-and-secrets`. +- Post-rollout feature-flag inventory, expiry, fallback behavior, removal plans, and orphan flag debt route to `feature-flag-lifecycle`; introducing the flag during rollout stays with `progressive-delivery`; generic dead-code cleanup stays with `dependency-and-code-hygiene`. +- Per-route LLM token budgets, tail-latency budgets, prompt and response caches, provider-failure degradation paths, and per-feature LLM cost attribution route to `llm-serving-cost-and-latency`; generic backend latency and capacity stays with `performance-and-capacity`; generic spend/reliability tradeoffs stay with `cost-aware-reliability`; generic remote-call retries, timeouts, and circuit breakers stay with `dependency-resilience`. +- Service/module/worker boundary ownership routes to `architecture-decisions`, even when retry policy is mentioned; concrete timeout, retry, idempotency, queue, or overload policy for an existing dependency routes to `dependency-resilience`. 
+- Repository legibility for AI comprehension, module-boundary maps, code-search-collision checks, function and file-size budgets, and one-tool-call locatability route to `code-readability-for-agents`; macro service boundaries stay with `architecture-decisions`; per-diff pre-merge review stays with `agent-pr-review`. +- Fixture inventory, anonymization of production-derived test data, fixture freshness-versus-determinism choices, and production/test data drift route to `test-data-engineering`; overall test strategy, CI signals, and merge-blocking checks stay with `testing-and-quality-gates`. +- Local, CI, staging, and production parity matrices, drift budgets, allowed-versus-required divergence, and "works only in one environment" failures route to `dev-environment-parity`; reproducible release artifacts and build-once/promote-many remain with `release-build-reproducibility`. +- Data pipeline freshness, lineage, and idempotent reprocessing route to `data-pipeline-reliability`; message contracts, replay semantics, and workflow orchestration route to `event-workflows`. +- New or changed cross-surface data contracts, producer/consumer schema evolution, and domain-interface responsibility route to `data-contracts`; single API contract changes stay with `api-design-and-compatibility`. +- Cache invalidation, derived values, and stale cache entries route to `caching-and-derived-data`; deciding whether stale reads are allowed by the storage model routes to `distributed-data-and-consistency`. +- Data model splits across databases, shards, or mutation boundaries route to `distributed-data-and-consistency`, even when a migration is mentioned; executing schema, backfill, index, or destructive data changes routes to `database-operations`. 
+- Cross-service workflows whose correctness depends on a database, storage, replication, sharding, or failover route to `distributed-data-and-consistency`; event/message replay, ordering, and DLQ behavior stay with `event-workflows`; in-process state machines, protocols, and concurrency invariants without storage semantics stay with `state-machine-correctness`. +- AI-assisted repo workflow, agent instructions, data boundaries, and generated-code acceptance route to `ai-coding-governance`; deployed LLM app security stays with `llm-application-security`. +- Any specific diff that needs a senior pre-merge review (human, AI, or mixed) routes to `agent-pr-review`; org-level AI coding controls still route to `ai-coding-governance`; generic review routing, responsibility, change size, and DORA workflow do not route unless tied to a concrete engineering surface; explicit launch readiness still routes to `production-readiness-review`; an active incident still routes to `incident-response-and-postmortems` first. +- System-level review rules, change-size limits, review-latency targets, and reviewer routing do not route by themselves; org-level AI coding rules for allowed actions, protected paths, secret/data boundaries, and required verification details route to `ai-coding-governance`. +- LLM tool, prompt-injection, retrieval-boundary, and unsafe-output risk routes to `llm-application-security`; LLM eval datasets, graders, thresholds, and regression checks route to `llm-evaluation`; production ML serving and drift stay with `ml-reliability-and-evaluation`. +- LLM prompt/response storage, session isolation, rollback, and artifact provenance stay with `llm-application-security` only when tied to prompt/retrieval/tool/output boundaries; otherwise route data lifecycle to `privacy-and-data-lifecycle`, tenant boundaries to `tenant-isolation`, rollout sequencing to `progressive-delivery`, and generic supply-chain trust to `software-supply-chain-security`. 
+- Tenant-boundary isolation proofs route to `tenant-isolation`, even when triggered by an incident; live incident command stays with `incident-response-and-postmortems`. +- Experiments, holdouts, exposure logging, and metric validity route to `experimentation-and-metric-guardrails`; operational canaries stay with `progressive-delivery`. +- Single-surface verification details stay with the matching specialist. `engineering-control-evidence` is for cross-surface control mapping, exception records, scorecards, and control packs. +- Public edge traffic defense routes to `edge-traffic-and-ddos-defense`; internal service-to-service traffic policy routes to `internal-service-networking`. +- Retry, timeout, circuit-breaker, load-shedding, and dependency overload policy routes to `dependency-resilience` even when implemented through internal traffic tooling; service identity, discovery, transport, and locality route to `internal-service-networking`. +- Browser or web client release checks, including field/lab performance signals, loading, interaction readiness, layout stability, runtime errors, payload growth, accessibility smoke, or a concrete UI PR review focused on those checks, route to `web-release-gates`; native mobile rollouts route to `mobile-release-engineering`; backend latency and headroom route to `performance-and-capacity`. +- Headroom and latency without spend tradeoffs route to `performance-and-capacity`; cost, spend, allocation, or reliability/cost tradeoffs route to `cost-aware-reliability`; pure billing work is out of scope. + +## Scope + +Product discovery, marketing, staffing, compensation, procurement, legal/auditor liaison, and broad compliance-program work are out of scope unless reframed as concrete engineering controls. 
diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/accessibility-gates.md b/plugins/sirmarkz/staff-engineer-mode/specialists/accessibility-gates.md new file mode 100644 index 00000000..93083c19 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/accessibility-gates.md @@ -0,0 +1,123 @@ +--- +name: accessibility-gates +description: "Use when designing or releasing user-facing flows needing keyboard, screen-reader, focus, contrast, or assistive checks" +--- + +# Accessibility Conformance Checks + +## Iron Law + +``` +NO CRITICAL USER FLOW SHIPS WITHOUT A NAMED CONFORMANCE LEVEL, ASSISTIVE-TECH CHECK, AND REGRESSION CHECK +``` + +Pick the conformance level explicitly (for most public web work, WCAG 2.x AA is the named target). Run the critical flow with at least one assistive-technology path before release. Add a regression check so the same defect cannot recur silently. For a solo developer or tiny project, the check can be a keyboard-only and screen-reader walkthrough recorded once per release; the discipline is that the walkthrough happened, not that anyone else performed it. + +## Overview + +Accessibility is a release quality property, not a post-launch polish pass. + +**Core principle:** check critical user journeys on semantic structure, keyboard access, focus behavior, visual contrast, assistive-technology behavior, and regression checks. + +## When To Use + +- The user is designing, building, changing, or releasing a user-facing flow that needs accessibility, conformance, assistive-technology support, keyboard navigation, focus order, contrast, labels, or release checks. +- A UI change affects forms, navigation, dialogs, errors, media, dynamic updates, or critical journeys. +- Automated checks and manual checks need to be combined into a release decision. +- A regression blocks users from perceiving, operating, or understanding the interface. 
+ +## When Not To Use + +- The main issue is loading speed, responsiveness, visual stability, or runtime errors; use `web-release-gates` instead. +- The main issue is native crash, startup, offline, or app-store rollout risk; use `mobile-release-engineering` instead. +- The request is brand design or marketing copy without accessibility engineering risk. +- The work is a legal policy discussion without concrete engineering checks. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Critical journeys, user surfaces, target conformance level, supported input modes, and assistive technologies. +- Changed components, labels, roles, focus behavior, keyboard paths, error handling, contrast, and dynamic content. +- Existing automated checks, manual test scripts, defect history, and release-blocking rules. +- Exceptions, expiry, severity, affected users, and compensating path. +- Telemetry or support signals for accessibility regressions where available. + +## Workflow + +1. **Define the target.** State the conformance expectation and critical journeys before evaluating details. +2. **Map the journey.** Identify every step, control, message, focus transition, and error state a user must complete. +3. **Check semantics and names.** Ensure controls expose meaningful structure, labels, state, and relationships. +4. **Verify operation.** Test keyboard-only and assistive-technology paths for completion, not just component snapshots. +5. **Check perception.** Review contrast, text resizing, motion, timing, media alternatives, and status updates where relevant. +6. **Combine results.** Use automated checks for broad regressions and manual checks for interaction quality. +7. **Check release.** Block critical journey failures; track lower-risk defects with severity, expiry, and retest date. +8. **Prevent recurrence.** Add component tests, examples, lint rules, or review checks for repeated failure patterns. 
+ +## Synthesized Default + +Check critical journeys with a named conformance target, automated checks, manual assistive-technology scripts, keyboard completion tests, dated exceptions with repair criteria, and regression tests for known defects. Accessibility checks should be part of launch readiness for user-facing changes. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Internal tools may use a narrower journey set only when the affected user group and alternative path are explicit. +- Emergency fixes can ship with a tracked accessibility exception only when delaying is riskier and a repair date exists. +- Automated checks are not enough for complex interactions; manual verification remains required for critical flows. + +## Response Quality Bar + +- Lead with the accessibility release decision, blocker list, conformance gap, or test plan requested. +- Cover target, critical journeys, semantics, keyboard behavior, focus, assistive-technology checks, contrast, exceptions, and regression checks before optional design advice. 
+- Name one concrete assistive-technology path for at least one critical journey, such as NVDA, VoiceOver, JAWS, TalkBack, Dragon, or switch control, with a pass/fail criterion for completing that journey. +- Make recommendations actionable with severity, blocking status, retest steps, and release criteria where relevant. +- For recurring defects or launch blockers, make the regression mechanism concrete: name the CI check, lint rule, component test, or recurring manual checklist; include the verification check, the test that would fail without the change, covered environment inventory such as local, CI, staging, or production-like, docs-as-code checklist location, and refresh cadence. +- Name the details to inspect, such as journey list, automated results, manual scripts, screenshots or recordings, defect history, and exception records; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them, explicitly requested tool-specific guidance, or a named assistive technology is needed for test results. +- Stay inside accessibility engineering. Route performance, mobile rollout, or broad legal policy only when those are central. +- Be concise: prefer journey-based check tables over broad accessibility lectures. + +## Required Outputs + +- Accessibility conformance target and journey inventory. +- Release check matrix: automated checks, manual checks, blocking status, and repair path. +- Critical journey manual test script. +- Exception register with severity, expiry, compensating path, and retest. +- Regression-prevention plan for recurring defects. +- Follow-up routes for performance or mobile-specific release risk where needed. + +## Checks Before Moving On + +- `target_defined`: conformance expectation and critical journeys are named. 
+- `journey_complete`: users can complete critical flows through supported input and assistive paths. +- `mixed_testing`: automated checks and hands-on testing are both used where interaction quality matters. +- `exception_responsibility`: every exception has severity, user-confirmed reason, expiry, and compensating path. +- `regression_check`: known failures have tests or checks to prevent recurrence. + +## Red Flags - Stop And Rework + +- Automated checks pass, but nobody tested the critical journey. +- Focus is trapped, lost, or moves unpredictably. +- Controls have visible labels but no reliable accessible names. +- Error messages are visible but not announced or associated with fields. +- Accessibility exceptions have no repair date or verification path. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Component-only checks | Test complete user journeys. | +| Automation as the whole answer | Add manual interaction verification. | +| Treating all defects alike | Block critical journey failures first. | +| Exceptions without expiry | Require expiry, compensating path, and retest. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/agent-pr-review.md b/plugins/sirmarkz/staff-engineer-mode/specialists/agent-pr-review.md new file mode 100644 index 00000000..6cebe962 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/agent-pr-review.md @@ -0,0 +1,155 @@ +--- +name: agent-pr-review +description: "Use when a PR or diff needs senior pre-merge review of intent, behavior verification, edge cases, or residual risk" +--- + +# Pre-Merge PR Review + +## Iron Law + +``` +NO DIFF MERGES WITHOUT VERIFIED INTENT, BEHAVIOR VERIFICATION, ASSIGNED RISK, AND A FAILURE-MODE PASS +``` + +If the stated intent does not match the actual diff, or the diff cannot show that the changed behavior was exercised by a test that would fail without the change, the diff is not reviewable yet. + +## Overview + +The default pre-merge review pass.
Applies whether the diff was written by a human, by an AI coding agent, or by both. Modern diffs increasingly contain AI-assisted code that looks plausible, so every review treats the diff as untrusted until intent, behavior verification, responsibility, and common failure modes (silent assumptions, plausible-but-wrong logic, hallucinated APIs, deleted-but-used code, scope creep, missing edge cases) have been checked against the actual change set. + +**Core principle:** review the diff against its originating task, not against the author's self-summary. The summary is a hypothesis; the diff is the source of truth. + +## When To Use + +- The user asks to review a PR, branch, diff, or change set before merging — regardless of who or what produced it. +- A coding agent has just finished a multi-file change, refactor, migration, or new feature and the user is deciding whether to merge. +- The user asks "is this safe to merge," "what would a senior review catch here," "review my last commit," "review this PR," "find risks in this diff," or "did the agent miss anything." +- The author's summary may not match what actually changed. +- The change touches paths the author was not explicitly scoped to and needs an explicit intent check. + +## When Not To Use + +- The work is pre-design: there is no diff yet; use `architecture-decisions` or `secure-sdlc-and-threat-modeling` instead. +- A live incident is underway; use `incident-response-and-postmortems` first. +- The request is org-level rules for AI-assisted work, not a single diff; use `ai-coding-governance` instead. +- The request is review routing, change-size policy, responsibility policy, or workflow metrics rather than a concrete diff; no routed specialist applies unless the prompt names a concrete engineering surface. +- The request is launch readiness across multiple surfaces with an explicit launch event; use `production-readiness-review` instead.
+- The PR is primarily a deprecation, sunset, or removal-control artifact; use `migration-and-deprecation` instead. +- The request is static-analysis, warning, dead-code, or maintenance-risk prioritization over changed files; use `dependency-and-code-hygiene` instead. +- The diff is one trivial fix the human author can self-review without a structured pass. + +## Info To Gather + +- **Diff scope:** files changed, lines added/removed, public-surface changes, generated-file changes, and deleted code. +- **Authorship context:** human, AI agent, or mixed; which agent or contributor produced the diff; what prompt or task it was given; what the task summary says changed. +- **Change type:** new feature, refactor, bug fix, dependency update, migration, generated code, or mixed. +- **Environment context:** target repo's tier, exposed surfaces, user-stated scope, local responsibility metadata or recent commits when available, and whether the change touches production paths, data, or shared libraries. +- **Test coverage state:** which tests exist for the touched paths, which were added, and which were modified or deleted. +- **Prior review state:** whether a human or other agent has already passed over the diff and what was flagged. +- **Stated intent versus diff:** the author's or agent's summary, the originating task, and the actual file-by-file delta. + +## Workflow + +1. **Reconstruct intent.** Restate what the change is supposed to do in one sentence, sourced from the task or PR description, not from the author's self-summary. Anchor the intent in the actual diff with at least one concrete file/function/line signal when available. Note any gap between intent and the diff's actual surface area. +2. **Map the diff.** Group changes by purpose: behavior change, refactor, test, generated/mechanical, dependency, configuration, deletion. Flag any group the stated intent does not justify as scope creep. +3. 
**Pin review anchors.** Before writing the verdict, select at least two separate changed locations from the diff and cite them as `file:line` in the final review. Prefer line-numbered changed files; if only a patch is available, cite the hunk file and added-line number from the patch. These anchors should include the most important behavior or risk locations, not just file names. +4. **Run the failure-mode pass.** For each change, check for: silent assumptions, plausible-but-wrong logic, hallucinated APIs or imports, deleted-but-still-used code, unmotivated edits, missing edge cases a careful review would consider, mismatched error handling, and copied-pattern code that does not match the local convention. These checks apply whether the diff is human, AI, or mixed; AI-assisted code raises the prior probability of each. +5. **Verify behavior is exercised.** Confirm the changed behavior has tests that fail without the change. New behavior without a failing-without-the-change test is treated as unverified. +6. **Check correctness on real inputs.** Look for boundary conditions, null/empty/large/concurrent inputs, error paths, and idempotency. Confirm the diff was not tested only against the happy path the author imagined. +7. **Check code-quality dimensions.** Compactly assess design, functionality, complexity, tests, naming, comments, and style as issue, OK, or not applicable based on the diff and surrounding code. Do not invent findings just to fill a dimension. +8. **Check responsibility and surface.** Confirm changed files fit the user's stated scope or local ownership info. Files touched outside the stated scope need an explicit reason or get flagged as out-of-scope. +9. **Check public-surface and contract impact.** Identify breaking changes to APIs, schemas, configs, on-disk formats, events, or shared modules. Confirm consumer impact has been considered. +10. 
**Check operational artifacts.** Identify missing rollback path, missing telemetry for new behavior, missing runbook update, missing migration safety, missing SLO/error-budget consideration, missing threat consideration for new trust-boundary changes, and missing docs. +11. **Classify findings.** For each finding, record category, support (file:line or behavior), recommended next action, and risk level (blocker, must-fix-before-merge, follow-up, or accepted with rationale). +12. **Use specialist lenses when needed.** If security, database, rollout, observability, accessibility, or contract-evolution concerns dominate, apply that narrower skill rather than expanding this review. +13. **Produce the structured artifact.** Output a single review with the categories below, not running prose. The user should be able to act on it without re-reading the diff. + +## Synthesized Default + +Use a structured pre-merge review pass: verify stated intent matches actual diff, check that changed behavior is exercised by a test that would fail without the change, scan for hallucinated APIs and deleted-but-used code, classify scope creep, and require file/line support plus next action for every blocker. Treat any author or agent self-summary as a hypothesis, not a finding. Use narrower specialist skills only as internal lenses when their surface dominates. + + + +## Phase Behavior + +- Ideation: do not use this specialist for risks or options before code exists; route pre-code risk shaping to the appropriate design, security, rollout, test, API, data, or architecture specialist. +- Design: do not use this specialist for tradeoffs or checks unless a concrete diff, branch, or patch already exists. +- Development: use only after development sequencing produces a diff or change set that needs pre-merge checks and review. +- Testing: evaluate tests and failure details attached to an existing diff; route test strategy before code exists to the testing specialist.
+- Release: evaluate pre-merge release, rollout, and rollback details attached to the diff. +- Maintenance: use only when a maintenance change has owners, drift context, and a concrete diff, branch, PR, or change set. +- Existing artifact: evaluate an existing diff, branch, PR, or change set as context for the pre-merge engineering decision; do not use this skill without the concrete change artifact. +- Missing details: ask for the diff, task, assumptions, and test results; say what to check next and do not invent findings against unseen code. + +## Exceptions + +- Throwaway prototypes isolated from production may use a lighter pass focused on hallucinated APIs and unmotivated edits. +- Mechanical or generated changes may use sample review plus a non-regression check rather than line-by-line review, when the generator and pattern are maintained and verified. +- Emergency fixes may merge with a documented blocker list, explicit user risk acceptance, and an immediate post-merge review and rollback plan. +- Diffs already checked once may use this skill to verify failure modes a routine review would not have looked for. + +## Response Quality Bar + +- Lead with the structured review artifact, blocker list, or scope-creep finding requested. +- Start the artifact with a `Review anchors` line containing at least two changed `file:line` citations when the diff has two or more changed lines; one anchor may support intent, but blocker and must-fix findings still need separate cited support. +- Cover intent verification, failure-mode pass, behavior verification, responsibility/scope details, public-surface impact, and missing operational artifacts before optional review breadth. +- Include a compact code-quality dimensions pass that explicitly covers design, functionality, complexity, tests, naming, comments, and style with issue, OK, or not applicable status tied to the diff.
+- Make findings actionable with file/line support, recommended next action, and risk classification; do not produce vibes-only review. +- Include at least two concrete diff anchors when the diff has enough changed lines: file:line citations, file:function references, or short quoted code excerpts. One anchor may support intent reconstruction; blocker and must-fix findings still need separate support. +- Name the details to inspect, such as the diff itself, the originating task or prompt, the test results, and the author's stated summary; do not state findings against unseen code. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside pre-merge review of a single diff. Route security depth, database migration depth, rollout safety, accessibility, and contract evolution to their responsible specialists rather than absorbing them here. +- Be concise: prefer a single structured artifact with categorized findings over running narrative. + +## Required Outputs + +- Review anchors: at least two changed `file:line` citations from the diff, unless the diff itself has fewer than two changed lines. +- One-sentence reconstructed intent and one-sentence assessment of whether the diff matches it, anchored to at least one changed file, function, or line when available. +- Explicit merge verdict: ready to merge, request changes, or block, with reasons tied to observed issues or their absence. +- Code-quality dimensions summary covering design, functionality, complexity, tests, naming, comments, and style, each marked issue, OK, or not applicable with brief support or reason. +- Categorized findings table with category, support (file/line or behavior), recommended next action, and risk level. +- Blocker list: changes that must not merge as-is, each with file/line support and next action. 
+- Failure-mode findings covering silent assumptions, plausible-but-wrong logic, hallucinated APIs, deleted-but-used code, unmotivated edits, missing edge cases, and scope creep. +- Missing-artifact list across rollback path, telemetry for new behavior, runbook updates, migration safety, threat consideration for new trust boundaries, and docs. +- Behavior-exercise summary stating which changed behaviors have a failing-without-the-change test and which do not. +- Specialist follow-up routes, capped and prioritized. +- Risk classification per finding (blocker, must-fix-before-merge, follow-up, accepted with rationale and user confirmation). + +## Checks Before Moving On + +- `intent_match`: stated intent is restated and compared to the actual diff; scope creep is named when present. +- `failure_mode_pass`: silent assumptions, hallucinated APIs, deleted-but-used code, unmotivated edits, and missing edge cases have each been considered explicitly. +- `behavior_exercised`: every changed behavior is tied to a test or an explicit unverified-behavior finding. +- `quality_dimensions`: design, functionality, complexity, tests, naming, comments, and style have each been explicitly addressed or marked not applicable with a diff-based reason. +- `finding_support`: every finding points to a file, line, or behavior and has a recommended next action. +- `risk_classified`: every finding has a risk level and a recommended next action. +- `surface_check`: public-surface, contract, schema, config, event, and shared-module impact has been addressed or marked not applicable with reason. +- `artifact_check`: missing rollback, telemetry, runbook, migration safety, threat consideration, and docs are listed when relevant. +- `diff_anchors`: final review includes at least two changed file:line citations, or states that the diff has fewer than two changed locations. + +## Red Flags - Stop And Rework + +- The review trusts the author's self-summary instead of checking the diff. 
+- Findings are stated as opinions without file/line or behavior support. +- New behavior is accepted because tests pass, without confirming any test would fail without the change. +- Deletions are accepted without checking for remaining callers, imports, or references. +- Out-of-scope file changes are merged because they "look harmless." +- Hallucinated APIs, types, or imports are not checked even though the author (human or AI) could have invented them. +- Specialist concerns (security, migration, rollout) are absorbed into this review instead of routed to the responsible specialist. +- The review produces prose only, with no categorized findings, support, next actions, or risk levels. +- The final verdict is given with fewer than two changed `file:line` review anchors when the diff contains enough changed lines. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Reviewing the author's narration | Review the diff against the originating task, not the self-summary. | +| Treating green tests as verification | Confirm a test exists that would fail without the change. | +| Reviewing line-by-line without intent | Group changes by purpose and check each group against stated intent. | +| Ignoring deletions | Search for remaining callers, imports, references, and tests of removed code. | +| Accepting plausible APIs at face value | Confirm imports, types, and external calls actually exist in the target environment. | +| Letting scope creep slide | Name out-of-scope edits and require justification or removal. | +| Skipping code-quality dimensions | Compactly cover design, functionality, complexity, tests, naming, comments, and style as part of the review artifact. | +| Doing the specialist's work here | Route security, migration, rollout, accessibility, and contract concerns to the responsible specialist. | +| Producing vibes review | Output a structured artifact with categories, support, next actions, and risk levels. 
| +| Giving a verdict before pinning support | Cite at least two changed `file:line` anchors first, then make the merge decision. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/ai-coding-governance.md b/plugins/sirmarkz/staff-engineer-mode/specialists/ai-coding-governance.md new file mode 100644 index 00000000..c60f76f4 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/ai-coding-governance.md @@ -0,0 +1,124 @@ +--- +name: ai-coding-governance +description: "Use when setting repo rules for AI coding agents: allowed actions, protected paths, data boundaries, verification" +--- + +# AI-Assisted Coding Controls + +## Iron Law + +``` +NO AI-ASSISTED CHANGE WITHOUT SCOPE, VERIFICATION DETAILS, DATA BOUNDARY, AND RESPONSIBLE ACCEPTANCE +``` + +If a coding agent cannot explain what it changed, why, how it was verified, and what data it touched, the change is not acceptably traceable. + +## Overview + +Produces a repo-local rule set for coding agents: allowed and forbidden actions, protected paths, sensitive-data and secret boundaries, required verification results, and traceability tied to the user and local details. Catches the moment when an agent rewrites twelve files at 11pm with no test run, no scope statement, and no accountability for the diff. + +**Core principle:** give coding agents explicit repo rules, constrain sensitive data and actions, require user-visible verification, and make generated changes meet the same bar as human changes. + +## When To Use + +- The user is designing coding-agent instructions, AI assistant repo rules, generated-code acceptance checks, protected paths, or AI coding rules as engineering controls. +- You want agents to follow repository practices without leaking data, skipping tests, or making anonymous changes. +- AI-generated changes affect production code, infrastructure, tests, docs, migrations, or release artifacts. 
+- The question is how to make agent output traceable, bounded, and safe during development. + +## When Not To Use + +- The request is per-PR, per-diff, or per-change pre-merge review ("review this PR before merge," "what did my agent miss here," "is this branch safe to merge") for any diff regardless of authorship; use `agent-pr-review`. This skill covers org-level and repo-level controls: allowed and forbidden actions, protected paths, secret and data boundaries, traceability, and the rules any diff must satisfy. `agent-pr-review` covers the senior review pass on a specific diff against those rules. +- The main risk is prompt injection, tool access, retrieval, or deployed LLM app behavior; use `llm-application-security`. +- The main issue is model eval harness design, graders, or regression checks for an LLM workflow; use `llm-evaluation`. +- The request is generic review routing, responsibility, change-size policy, or workflow metrics for human and agent code together, with no AI-agent control decision. +- The request is broad AI ethics, legal rules, procurement, or staffing; out of scope. +- The task is ordinary code review with no AI-assisted workflow concern; use `agent-pr-review` only when there is a concrete diff. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Agent capabilities, allowed actions, repo instructions, protected paths, and responsibility rules. +- Sensitive data boundaries, secrets handling, dependency rules, and generated-content restrictions. +- Required verification, acceptance checks, change history, commit hygiene, and release checks. +- Existing failure modes: hallucinated APIs, unbounded rewrites, skipped tests, broad diffs, or leaked context. +- Exception path for emergency fixes, prototypes, and low-risk generated assets. + +## Workflow + +1. **Scope the agent.** Define allowed tasks, forbidden actions, protected files, and selection rules. +2. 
**Set repo instructions.** Encode coding style, testing, security, data handling, dependency, and release expectations in agent-readable guidance. +3. **Protect data.** Prevent agents from exposing secrets, sensitive records, private logs, or unnecessary user data. +4. **Require small explainable diffs.** Keep changes small, explain intent, preserve responsibility, and separate mechanical edits from behavior changes. +5. **Demand records.** Require tests, validation output, static checks, or explicit limitations before accepting agent changes. +6. **Handle dependencies carefully.** New dependencies need purpose, update path, license/security rationale where applicable, and removal plan if experimental. +7. **Trace agent work.** Track prompts, tool actions, changed files, verification, and explicit user confirmation where production risk exists. +8. **Tune the rules.** Convert repeated agent mistakes into clearer instructions, tests, or automated checks. + +## Synthesized Default + +Use repo-local agent instructions, least-privilege tool access, protected-path rules, sensitive-data boundaries, small diffs, mandatory verification results, and human responsibility for production changes. Treat AI-generated code as untrusted until tests, checks, and source-specific details show it fits the system. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. 
+- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Throwaway prototypes can use lighter checks only when isolated from production code, data, and release paths. +- Mechanical edits may use sampled checks if deterministic and backed by non-regression checks. +- Emergency agent-assisted fixes may proceed faster with explicit user confirmation and immediate post-fix result capture. + +## Response Quality Bar + +- Lead with the control rule, repo-instruction change, acceptance check, or risk finding requested. +- Cover scope, responsibility, data boundaries, verification, tests, dependency rules, traceability details, and exceptions before optional operational detail. +- Make recommendations actionable with protected paths, allowed actions, required verification, user confirmations, and fallback rules where relevant. +- Name the details to inspect, such as agent instructions, diffs, test output, sensitive-data boundary checks, dependency rationale, and confirmation records; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside AI-assisted development controls. Use deployed LLM security or eval-harness skills only when that surface is the central risk. +- Be concise: prefer enforceable repo rules and checks over broad AI statements. + +## Required Outputs + +- AI-assisted coding rule set for the repo or change. +- Allowed and forbidden agent actions. +- Sensitive-data and secret-handling boundaries. +- Verification and acceptance checks for agent changes. 
+- Dependency and generated-content acceptance rules. +- Traceability checklist. +- Exception rule with user confirmation and expiry. + +## Checks Before Moving On + +- `scope_defined`: allowed tasks, forbidden actions, and protected paths are explicit. +- `data_boundary`: secrets, sensitive records, and private context handling are addressed. +- `small_diff`: changes are small enough to understand and tied to a user-visible change trail. +- `verification_required`: tests or validation results are required before acceptance. +- `work_record`: prompt, action, diff, checks, and confirmation are linked where risk warrants. + +## Red Flags - Stop And Rework + +- Agent output is accepted because it looks plausible. +- The agent rewrites unrelated files without explicit user confirmation. +- Sensitive logs, secrets, or user data are pasted into prompts unnecessarily. +- New dependencies appear with no rationale, update path, or removal plan. +- Verification is described but not actually run. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Rules as prose | Put rules where agents and acceptance checks will use them. | +| Trusting generated code | Require tests, checks, and records. | +| Unlimited agent scope | Define protected paths and user-confirmation triggers. | +| No learning loop | Convert repeated failures into rules or checks. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/api-design-and-compatibility.md b/plugins/sirmarkz/staff-engineer-mode/specialists/api-design-and-compatibility.md new file mode 100644 index 00000000..6c8a848e --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/api-design-and-compatibility.md @@ -0,0 +1,144 @@ +--- +name: api-design-and-compatibility +description: "Use when designing new API contracts, endpoints, SDK surfaces, or changing exposed behavior and client compatibility" +--- + +# API Design And Compatibility + +## Iron Law + +``` +NO API CONTRACT WITHOUT COMPATIBILITY, ERROR, IDEMPOTENCY, AND EVOLUTION RULES +``` + +If current or future clients cannot tell what the contract means, how errors behave, whether retries are safe, or how the API can evolve, it is not ready. + +## Overview + +An API is a long-lived contract with current or future clients, retries, partial failures, and migration lag. + +**Core principle:** make contracts explicit, evolvable, retry-safe, observable, and compatible by default. + +## When To Use + +- The user is designing or changing API behavior, service contracts, operation names, generated-client shape, versioning, compatibility, deprecation, pagination, filtering, batch operations, error models, idempotency, or client migration. +- A new system, service, endpoint, SDK surface, or interservice contract is being built and needs a client-facing contract before launch. +- A change adds, removes, renames, retypes, or changes semantics of fields, operations, defaults, errors, events, or resources exposed to another component or client. +- The user asks whether an endpoint, schema, interface, or service contract can evolve safely. +- A retryable mutating operation needs idempotency behavior. + +## When Not To Use + +- The data model is purely internal and is not exposed, or planned to be exposed, through an interface. 
+- The main issue is per-call timeout/retry behavior rather than API contract; use `dependency-resilience` instead. +- The request is broad secure design; use `secure-sdlc-and-threat-modeling` instead unless API contract is central. +- The request is event schema evolution inside an asynchronous workflow; use `event-workflows` instead unless the external API contract is the main surface. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Planned or existing consumers, client release cadence, compatibility expectations, and deprecation tolerance. +- For new APIs, intended consumer classes and discovery path; for existing APIs, known consumers and impact signals. +- Operations/resources, generated-client method shape, request and response fields, event shapes, status/error semantics, defaults, and side effects. +- Authentication, authorization, rate limits, quotas, tenant context, activity-log needs, and abuse cases. +- Retry behavior, idempotency needs, duplicate suppression, and replay windows. +- Pagination, filtering, ordering, sorting, cursor stability, and consistency expectations. +- Versioning policy, launch evolution rules, migration telemetry where clients already exist, usage by client/version, and existing deprecation process. + +## Workflow + +1. **Define the contract boundary.** State who consumes the API, whether it is public or interservice, what compatibility promise exists, and which behaviors are observable by clients. +2. **Model operations and resources.** Use customer-domain terms, one clear action per operation, stable resource names, and request/response shapes that generate readable client methods. +3. **Classify the contract surface.** For new APIs, mark each field, operation, error, default, enum, and semantic rule as a launch-time contract commitment. For existing APIs, mark each change as compatible, conditionally compatible, or breaking. +4. 
**Prefer additive evolution.** Add optional fields, new operations, new enum values with tolerant readers, and new versions only when needed. +5. **Design error semantics.** Use a small stable error surface with machine-readable categories, typed programmatic fields, human-readable detail, retryability, correlation identifiers, and safe redaction. +6. **Make retries safe.** For mutating operations that clients may retry, require idempotency keys, operation identifiers, or dedupe semantics. Scope dedupe state to the caller and request parameters, expire it deliberately, and ensure duplicate retries create no side effects. +7. **Handle collections deliberately.** Prefer stable cursor-style pagination for mutable collections; define ordering, filtering, empty results, cursor-token expiration, and list item summaries that avoid needless follow-up calls. +8. **Bound filters and payloads.** Keep filters explicit, bounded, commutative, and limited to fields the caller may see; define unknown, malformed, duplicate, and over-limit behavior. Publish maxima for variable inputs, payloads, and inner lists at launch. +9. **Shape batch operations intentionally.** Use batch APIs only for repeated same-action work. Shape each item like the singular operation, include per-item correlation, separate successes from errors, define partial-success behavior, and reject whole invalid batches before attempting items. +10. **Plan evolution.** For new APIs, define how the contract can add fields, operations, enum values, limits, and versions later, plus how intended consumers will discover and adopt it. For existing APIs, use telemetry to identify clients, publish deprecation windows, support overlap, and define removal checks. +11. **Check security and abuse.** Include authorization, rate limits, tenant isolation, audit events, and input validation as part of the contract. 
+ +## Synthesized Default + +Design APIs around domain contracts and generated-client ergonomics, not internal storage shape. Use additive compatibility first and explicit versions only when semantics must break. Mutations that can be retried need idempotency. Lists, filters, batches, and unbounded inputs need explicit limits and stable semantics at launch. Errors should be structured, stable, safe to expose, and tied to retry behavior. New APIs need evolution rules before launch; deprecation requires telemetry, migration support, and a removal check. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Internal APIs with one deployable client may use tighter migration windows, but still need compatibility during rollout. +- A breaking change is acceptable when security, correctness, or unsustainable complexity justifies it and a migration plan exists. +- Cursor pagination may be unnecessary for immutable or tiny bounded collections. +- Protocol-specific conventions may shape syntax, naming style, and transport status, but the compatibility, idempotency, error, and migration rules still apply. 
+ +## Response Quality Bar + +- Lead with the concrete decision, blocker list, or migration plan requested. +- Cover all compatibility, error, idempotency, and migration risks before optional API topics. +- Make recommendations actionable with checks, stop conditions, and removal criteria where relevant. +- Name the details to inspect, such as client telemetry, version usage, retry behavior, and migration readiness; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside the API surface. Mention pagination, rate limits, auth, audit, or tenant controls only when the prompt or risk makes them material. +- Be concise: avoid generic API background and prefer compact compatibility matrices or checklists. +- For naming or shape decisions, provide concrete operation/resource names, generated-client ergonomics notes, and compatibility rationale. +- For PR, release-note, or copy-polish requests that hide contract changes, decide safety before wording. If the contract is unsafe, lead with the blocker and give corrected release-note constraints only after the compatibility and idempotency fixes. +- Keep narrow answers bounded to one decision, the material blockers, and the minimum contract changes needed to make the rollout safe. + +## Required Outputs + +- API contract decision with planned or existing consumers, compatibility class, and risks. +- Consumer discovery or impact plan: intended consumer classes for new APIs, known-consumer signals for existing APIs. +- Operation/resource naming decision and generated-client ergonomics notes. +- Compatibility and evolution matrix for each new or changed operation, field, default, enum, event, error, and status behavior. +- Versioning and deprecation plan with launch evolution rules, telemetry where available, and removal checks. 
+- Error model with retryability, correlation, redaction, and client action. +- Idempotency policy for retryable mutations. +- Pagination, filtering, ordering, bounded-input, batch, and rate-limit policy. +- Security and audit requirements for the exposed surface. + +## Checks Before Moving On + +- `compatibility_class`: every new contract element is marked as a launch-time commitment, and every contract change is classified as additive, compatible, conditionally compatible, or breaking. +- `operation_shape`: operations have one customer-visible action, stable resource terms, generated-client readability, and explicit side effects. +- `idempotency_policy`: retryable mutations have an idempotency or dedupe design. +- `error_model`: errors define machine code, human detail, retryability, correlation, and safe disclosure. +- `collection_contract`: lists and filters define pagination, ordering, empty results, field visibility, bounds, token stability, and expiration. +- `batch_semantics`: batch APIs define item limits, item correlation, partial success, per-item errors, and whole-request rejection rules. +- `consumer_discovery`: new APIs define intended consumer classes and discovery path; existing APIs identify known consumers or the telemetry gap. +- `evolution_plan`: new APIs have rules for future compatible additions, and deprecation or breaking changes have client usage telemetry and removal criteria. +- `abuse_boundary`: authz, rate limits, tenant context, activity logging, and validation are addressed where relevant. + +## Red Flags - Stop And Rework + +- "Only internal clients use it" is used to skip compatibility while clients deploy independently. +- A field is repurposed with new semantics instead of adding a new field or version. +- Operation names expose implementation steps, combine unrelated actions, or generate confusing client methods. +- Errors are free-form strings with no retryability or client action. 
+- Mutating operations are retryable but not idempotent. +- A list, filter, or batch API ships without bounds, collection traversal semantics, or partial-failure behavior. +- Filters expose fields the caller cannot otherwise inspect. +- Deprecation depends on guessing client usage instead of telemetry. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Versioning every change | Prefer additive compatible changes; reserve versions for semantic breaks. | +| Treating generated clients as an afterthought | Decide operation names and shapes as part of the public contract. | +| Treating status codes as the error model | Include stable application error codes and retry guidance. | +| Offset pagination on mutable data | Use stable cursors when inserts/deletes can shift results. | +| Retrofitting bounds after launch | Set list, filter, batch, payload, and processing limits before clients depend on them. | +| Hiding per-item batch failures | Echo request identifiers and separate successes from errors. | +| Ignoring slow clients | Plan overlap, telemetry, and explicit removal checks. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/architecture-decisions.md b/plugins/sirmarkz/staff-engineer-mode/specialists/architecture-decisions.md new file mode 100644 index 00000000..0880892c --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/architecture-decisions.md @@ -0,0 +1,146 @@ +--- +name: architecture-decisions +description: "Use when making system design decisions, ADRs, service/module/worker boundaries, or architecture tradeoffs" +--- + +# Architecture Decisions And Decision Records + +## Iron Law + +``` +NO ARCHITECTURE DECISION WITHOUT FORCES, ALTERNATIVES, AND COST TO CHANGE COURSE +``` + +If the design lacks goals, constraints, alternatives considered, and a clear read on what would make you change course later, do not treat it as decided. 
For solo work, responsibility can simply name who runs the local checks and keeps the decision current. + +## Overview + +Architecture decision work turns "components and opinions" into explicit goals, tradeoffs, failure modes, and decisions future readers can understand. Works the same at any project size: the discipline is forces, alternatives, and cost to change course, not the formal process around it. Shape decisions by the forces they must satisfy: user outcomes, constraints, data, reliability, security, operability, evolvability, and cost. + +## When To Use + +- Making, shaping, or revisiting system design decisions, RFCs/design docs, ADRs, service boundaries, dependency direction, or tradeoff analysis. +- A change affects data responsibility, public contracts, reliability, deployment topology, security boundaries, or operational responsibility. +- The user asks whether a monolith, module, service, workflow, platform component, or integration boundary "holds up". +- A prior decision needs to be recorded or revisited with current constraints. + +## When Not To Use + +- Live outage handling: use `incident-response-and-postmortems`. +- Code style, naming, formatting, or local implementation review: use `agent-pr-review` only for a concrete diff; use `code-readability-for-agents` when repository legibility is the design problem. +- Launch readiness aggregation: use `production-readiness-review`. +- Narrow API compatibility issue: use `api-design-and-compatibility`. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Problem statement, users, goals, non-goals, constraints, and success criteria. +- Current and proposed architecture, data flows, trust boundaries, interfaces, dependencies, and runtime responsibility model. +- Critical-path storage and runtime dependency choices, including latency, availability, failover, coupling, alternatives, and reversal or isolation plan.
+- Operability notes: how the user or agent debugs, replaces, or degrades around the design, what the fallback path is, and where that path is tested or documented. +- Alternatives considered, including "do nothing", "keep modular", "split later". +- Reliability, security, privacy, deploy, data consistency, migration, operational risks. +- Existing incidents, SLOs, costs, scale limits, compliance constraints, roadmap pressures. + +## When Inputs Are Referenced But Not Visible + +If the user references an artifact (sketch, RFC, diff, diagram) that is not in +the workspace or thread, do not stop at "please paste it." Produce a +strawman ADR draft from the prompt's named subject (e.g., "split search +service") with: (a) a Forces table listing ≥2 likely forces with rationale, +(b) an Alternatives table with ≥2 named options and rejection reasons, +(c) an explicit Decision line plus a Consequences table split into Positive / +Negative columns, (d) a Reversibility row with cost-to-undo and the trigger +that would force reconsideration, and (e) a responsibility field. Mark every +inferred field as ASSUMED so the user can correct it. + +## Workflow + +1. **Frame the decision.** Write the decision as one clear question and list goals, non-goals, and constraints before evaluating solutions. +2. **Emit a compact ADR-shaped first answer.** Before asking for more artifacts, give the user a usable decision skeleton containing: decision question, context/forces with rationale, explicit decision status or decision, at least two rejected alternatives with reasons, positive and negative consequences, reversibility cost and reconsideration trigger, and responsibility owner or check path. Mark unknowns as `ASSUMED` or `NEEDS CHECK` instead of omitting the section. +3. **Map the system.** Identify data flow, control flow, dependency direction, trust boundaries, failure domains, and operational checkpoints. +4. 
**Map bounded contexts.** Produce a bounded-context map naming each context, its responsibility owner or check path, the language/model it uses, and the relationship to every adjacent context (upstream/downstream, conformist, anti-corruption layer, shared kernel, partnership, customer/supplier, separate ways). Note where a context translates a neighbor's model and where it conforms. +5. **Prefer simpler boundaries first.** Start with modular design and explicit contracts. Add distribution only for independent scaling, release cadence, responsibility, isolation, or blast-radius needs. +6. **Compare alternatives.** Evaluate at least two real options plus the current state. Include consequences, rejected alternatives, and what would make the decision wrong later. +7. **Specify fitness functions.** Write the architectural invariants the system must hold as testable checks. Each fitness function names: the property under test, the metric, the threshold or rule, the measurement source, the evaluation cadence, the failure response, and the local check path. Cover at minimum the dependency-direction rules, the public-contract compatibility rules, the latency or throughput budgets the boundary depends on, and any blast-radius or isolation invariant the design relies on. +8. **Evaluate runtime dependency responsibility.** For any critical runtime dependency or storage choice, state how the user or agent can debug it, patch or change it, work around issues, isolate or reverse the decision, and exit or degrade if it fails. Keep this at design-time adoption criteria; timeout/retry policy goes to `dependency-resilience`, and launch details go to `production-readiness-review`. +9. **Evaluate cross-cutting risks.** Cover reliability, overload, data correctness, security, observability, deployment safety, recovery, cost, and maintainability. +10. 
**Record the decision.** Create an ADR or design-decision summary with status, context (>=2 forces with rationale), decision, consequences (split positive and negative), reversibility (cost + reconsideration trigger), supporting details, fitness-function references, and follow-up checks. +11. **Use specialist checks internally.** Apply the SLO, HA, dependency resilience, secure design, rollout, or data consistency skill when the design exposes that surface. + +## Synthesized Default + +Use a compact design decision plus ADR. Keep the system modular and technology-agnostic until the design shows it needs distribution. When distribution is justified, make responsibility, contracts, failure modes, observability, and deployability explicit before endorsing the split. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Exploratory prototypes can use a lightweight decision note if explicitly non-production and disposable. +- Regulated, security-sensitive, or tier-1 systems need a fuller risk register and change trail. +- Reversible local implementation choices may be documented in code or PR context instead of an ADR. 
+- If the system is already failing operationally, incident or reliability work may precede full architecture cleanup. + +## Response Quality Bar + +- Lead with the architecture decision, decision status, or highest-severity blockers. +- Cover goals, alternatives, responsibility, boundaries, data flow, and failure modes before optional architecture breadth. +- Make recommendations actionable with checks, stop conditions, and follow-up decisions. +- Name the details to inspect, such as SLOs, traffic, incidents, data contracts, threat boundaries, and migration checks; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside the design or decision. Add at most two specialist follow-ups, only for material unresolved surfaces. +- Be concise: prefer compact ADRs, decision tables, and risk registers over generic architecture theory. +- For pre-build, ticketing, or milestone-readiness requests, distinguish implementation tasks from unresolved architecture decisions. Use compact decision, risk/tradeoff, alternative, responsibility, and check tables; do not expand into a full narrative ADR unless asked. + +## Required Outputs + +- Architecture decision summary with context, goals, non-goals, and constraints. +- ADR with status, decision, alternatives, consequences, and a concrete responsibility value (user, local check path, or supplied project role; if unknown, use `ASSUMED: responsibility` rather than a blank or `TBD`). +- System map covering data flow, dependencies, trust boundaries, and responsibility. +- Runtime dependency adoption criteria covering supportability, changeability, fallback, and exit/degradation path. +- Critical-path storage or dependency decision entry with forces, alternatives, failure model, and reversal or isolation path. 
+- Bounded-context map listing each context with fields: name, responsibility owner or check path, model/language, upstream contexts, downstream contexts, relationship to each neighbor (conformist, anti-corruption layer, shared kernel, partnership, customer/supplier, separate ways), and the translation surface where a neighbor's model is adapted. +- Fitness-function specification listing each architectural invariant with fields: property under test, metric, threshold or rule, measurement source, evaluation cadence, failure response, and local check path. Cover dependency-direction rules, public-contract compatibility, latency or throughput budgets the boundary depends on, and any blast-radius or isolation invariant. +- Risk register with likelihood, impact, mitigation, and records. +- Decision table showing default, alternatives rejected, and exception conditions. +- Follow-up checks capped at two, each tied to a specific unresolved surface. + +## Checks Before Moving On + +- `decision_record`: the ADR states context, decision, status, alternatives, and consequences. +- `goal_alignment`: every recommended architecture element maps to a goal, constraint, or risk. +- `boundary_check`: service/module boundaries have responsibility, contracts, data responsibility, and failure behavior. +- `context_map`: every named context has a model, upstream and downstream neighbors, and the relationship pattern to each neighbor; translation surfaces are explicit where neighbors disagree on the model. +- `fitness_functions`: every architectural invariant the design depends on has a property, metric, threshold or rule, measurement source, evaluation cadence, failure response, and local check path; vague "should be fast" or "should be loosely coupled" entries are rejected as not testable. +- `risk_coverage`: reliability, security, data, deploy, observability, and operations risks are considered. 
+- `dependency_responsibility`: critical runtime dependencies have supportability, change path, fallback path, and exit or degradation plan. +- `critical_path_tradeoff`: critical-path storage and dependency choices state forces, alternatives, failure behavior, and reversal or isolation path. +- `follow_up_cap`: no more than two follow-up skills are recommended unless the output is a sequencing plan. + +## Red Flags - Stop And Rework + +- Components are named without their responsibility, contracts, data flows, or failure modes. (Solo work: the responsibility value can be "user + local checks"; the rule is no anonymous components, not formal headcount.) +- A distributed design is chosen because it is fashionable, not because constraints require it. +- Alternatives are missing or all alternatives are strawmen. +- The design pushes complexity into operations without on-call responsibility or runbooks. +- A critical runtime dependency is accepted even though the user or agent has no path to debug, change, replace, or degrade around it from local tools and records. +- Security, observability, migration, and rollback are left as "implementation details" for a high-risk decision. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating diagrams as decisions | Record the decision, forces, consequences, and responsibility. | +| Approving distribution too early | Prefer modular boundaries until scale, responsibility, release, or blast-radius needs justify it. | +| Hiding rejected options | State what was rejected and why, so future readers do not repeat the debate.
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/backup-and-recovery.md b/plugins/sirmarkz/staff-engineer-mode/specialists/backup-and-recovery.md new file mode 100644 index 00000000..8a5bb944 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/backup-and-recovery.md @@ -0,0 +1,127 @@ +--- +name: backup-and-recovery +description: "Use when setting RTO/RPO, backup design, restore tests, or recovery from corruption, deletion, or site loss" +--- + +# Backup Restore And Disaster Recovery + +## Iron Law + +``` +NO RECOVERY PLAN WITHOUT A TESTED RESTORE AND MEASURED RTO/RPO +``` + +A successful backup job is not a restore test. Replication is not a backup. Multi-location serving does not show that data can be recovered. + +## Overview + +Backups do not matter until a restore works. + +**Core principle:** define recoverability by RTO/RPO and check it with restore tests under realistic failure scenarios, including destructive operators and corrupted data. + +## When To Use + +- The user asks about backups, restores, disaster recovery, RTO, RPO, PITR, immutable backups, location recovery, ransomware recovery, or destructive data changes. +- A stateful launch or production readiness review (PRR) needs recovery results. +- The system must recover from corrupted rows, accidental deletion, bad migrations, lost keys, location-wide loss, or compromised operators. +- The user asks which DR strategy to use: backup/restore, pilot light, warm standby, or active-active. + +## When Not To Use + +- The main goal is serving through fault-domain loss without restoring data; use `high-availability-design` instead. +- The request is normal unit/integration testing. +- The issue is online schema/backfill execution before disaster occurs; use `database-operations` instead. +- A live outage needs command, communications, and mitigation; route to `incident-response-and-postmortems` alongside this skill.
+ +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Essential and critical data sets, customer journeys, data classification, and deletion/corruption blast radius. +- RTO/RPO expectations by journey, tenant, data class, and regulatory/customer commitment. +- Backup method, cadence, retention, location, encryption, key responsibility, immutability, and access policy. +- Replication topology, lag, consistency model, PITR capability, and location dependencies. +- Restore procedure, last restore test results, restore environment, validation queries, and rehearsal history. +- Destructive scenarios: operator error, ransomware, compromised credentials, bad deploy, bad migration, and key loss. + +## Workflow + +1. **Classify what must be recovered.** Separate essential user-critical data sets from broader serving availability, durability, correctness, and audit/history requirements. +2. **Set RTO/RPO.** Record maximum tolerable downtime and data loss for each critical journey and data set. +3. **Map backup coverage.** Include data, metadata, schema, config, secrets/keys, object stores, queues, indexes, and derived state. +4. **Check isolation.** Ensure backups and keys survive accidental deletion, malicious operator action, account compromise, and ransomware. +5. **Design restore paths.** Include full restore, partial restore, point-in-time recovery, location rebuild, and corruption repair. +6. **Run a restore check.** Restore into a controlled environment, run correctness checks, measure elapsed time and data loss, and record gaps. +7. **Choose DR posture.** Use backup/restore, pilot light, warm standby, active-passive, or active-active based on RTO/RPO, complexity, cost, data residency, and operations maturity. +8. **Feed findings back.** Create blockers for PRR, platform fixes, runbook updates, and future drills. + +## Synthesized Default + +Use recent restore tests tied to RTO/RPO as the default. 
Protect backups and encryption keys in a separate trust and blast-radius boundary. Prefer the simplest DR strategy that meets RTO/RPO and residency constraints; do not choose active-active unless the serving requirement and operational maturity justify the operational cost. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Stateless services may document dependency recovery rather than service-local backups. +- Derived indexes or caches may be rebuilt instead of backed up if rebuild time fits RTO and source data is protected. +- Active-active may be required for very low RTO, but it still needs corruption recovery and backup isolation. +- Emergency data repair during an incident may proceed before full DR analysis, but restore checks and postmortem actions must follow. + +## Response Quality Bar + +- Lead with the restore readiness decision, DR strategy, RTO/RPO gap, or blocker list requested. +- Cover backup coverage, retention, encryption/key recovery, isolation, restore runbooks, corruption/PITR/partial restore, validation, and remediation before optional DR breadth. 
+- Make recommendations actionable with commands, prerequisites, checks, stop criteria, measured targets, and remediation deadlines where relevant. +- Name the details to inspect, such as backup job metadata, restore logs, validation queries, key recovery checks, retention settings, immutable storage controls, and measured RTO/RPO; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside backup, restore, and DR. Route HA serving design or incident repair only when those are the central unresolved risk. +- Be concise: avoid generic DR taxonomy and prefer compact coverage matrices and restore result tables. + +## Required Outputs + +- DR strategy decision record. +- RTO/RPO table by journey and data set. +- Essential-data coverage table showing source of truth, restore type, validation, and measured result. +- Backup coverage, retention, encryption, key, and immutability matrix. +- Restore runbook with prerequisites, commands, validation, and rollback. +- PITR, partial restore, corruption repair, and location recovery plan. +- Restore test result log with measured RTO/RPO and gaps. +- Remediation backlog for missing coverage or failed restore criteria. + +## Checks Before Moving On + +- `restore_test`: a recent restore test exists, or its absence is called out as a blocker. +- `essential_data_list`: data needed for user-critical recovery is identified separately from lower-criticality copies. +- `rto_rpo_fit`: measured restore time and data loss meet the stated targets, or exceptions have a user-accepted deadline and verification path. +- `measured_restore`: restore behavior is measured against the stated objective rather than described from intent. +- `coverage_matrix`: critical data, metadata, schema, config, and keys have backup or rebuild coverage. 
+- `isolation_check`: backups and keys are protected from destructive operator, compromised credential, and ransomware scenarios. +- `validation_queries`: restored data has correctness checks, not just process completion. + +## Red Flags - Stop And Rework + +- The only evidence of recoverability is "backup job succeeded". +- Replication is treated as protection against accidental deletion or corruption. +- Backups and production data are deletable by the same credentials. +- Encryption keys needed for restore are not backed up, recoverable, or separately protected. +- RTO/RPO is copied from a platform default without measuring restore time. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Equating HA with DR | HA keeps serving; DR restores lost or corrupted state. | +| Testing full restore only | Include partial restore, PITR, corruption repair, and location rebuild where relevant. | +| Ignoring derived state | Decide whether indexes, caches, search, and analytics are backed up or rebuilt inside RTO. | +| Treating drills as ritual | Capture measured time, data loss, validation results, and remediation patches. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/caching-and-derived-data.md b/plugins/sirmarkz/staff-engineer-mode/specialists/caching-and-derived-data.md new file mode 100644 index 00000000..307392f0 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/caching-and-derived-data.md @@ -0,0 +1,129 @@ +--- +name: caching-and-derived-data +description: "Use when designing or changing caches, search indexes, derived values, or materialized views needing freshness rules" +--- + +# Caching And Derived Data + +## Iron Law + +``` +NO CACHE WITHOUT FRESHNESS, INVALIDATION, AND MISS-STORM BEHAVIOR +``` + +If writers, invalidators, readers, and downstream systems are not modeled, the cache can become an outage or data-corruption source. + +## Overview + +Caching is a correctness path disguised as a performance optimization.
+ +**Core principle:** every cache or derived view needs explicit freshness, invalidation, stampede protection, failure behavior, and observability. + +## When To Use + +- The user is designing, building, changing, or operating a cache, search index, materialized view, or derived-state path and asks about invalidation, TTLs, stale entries, index refresh, cache stampedes, request coalescing, stale-while-revalidate, or derived-state operations. +- A cache miss or cache failure can overload a backing dependency. +- Derived data needs freshness or repair guarantees. +- The user has already decided stale reads are acceptable and needs operational mechanics. + +## When Not To Use + +- The primary question is whether stale reads are semantically acceptable; use `distributed-data-and-consistency` instead. +- The work is primary storage choice or transaction design. +- The issue is warehouse/ETL pipeline freshness; use `data-pipeline-reliability` instead. +- The problem is generic dependency overload without cache mechanics; use `dependency-resilience` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Cached objects, keys, writers, invalidators, readers, and responsibility paths. +- Freshness requirement, TTL, negative caching, versioning, and stale-read tolerance. +- Backing dependency capacity, miss amplification, hot keys, and cache population path. +- Failure behavior: cache unavailable, cache cold, invalidation delayed, stale write, partial rebuild. +- Normal hit-rate range, entry size bound, flush or eviction impact, and backing-load increase under cold-cache behavior. +- Stampede controls: request coalescing, leases, single-flight, prewarming, and rate limits. +- Repair path: reindex, rebuild, invalidate all, partial repair, and correctness checks. +- Metrics: hit/miss, stale reads, evictions, rebuild lag, invalidation lag, downstream load, and tail latency. + +## Workflow + +1. 
**Confirm stale-read semantics.** If not decided, route to distributed data before choosing cache mechanics. +2. **Map the lifecycle.** Identify write, invalidate, fill, read, expire, repair, and rebuild paths. +3. **Set freshness policy.** Define TTL, maximum staleness, validation, version checks, and user-visible behavior. +4. **Protect downstreams.** Model miss amplification and add coalescing, leases, prewarming, or load shedding. +5. **Handle invalidation as correctness.** Use explicit invalidation, versioned values, or repair scans when stale writes can occur. For cache-aside writes, define the source-of-truth update and invalidation order. +6. **Define degradation.** State behavior when cache is cold, unavailable, partitioned, or stale. +7. **Instrument correctness and load.** Track stale-read rate, invalidation lag, rebuild lag, hit/miss, entry-size rejects, cold-cache state, and downstream saturation. Set hit-rate alerts tight against the normal operating point — at high hit rates, a small absolute drop translates to a multiplicative increase in backing load (a hit rate falling from 95% to 85% raises the miss rate from 5% to 15%, tripling backing load rather than adding 10%), so alarming on a fixed absolute floor misses the operating-point sensitivity. +8. **Plan repair.** Include manual and automated invalidation/rebuild with verification. + +## Synthesized Default + +Use explicit TTLs, version-aware invalidation, request coalescing, downstream protection, stale-read observability, and repair paths. Treat cache invalidation as part of the write path and derived-state maintenance as an operational responsibility; never let the cache become the only correctness check. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria.
+- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Write-through or write-behind can be appropriate only when write amplification, durability, ordering, and failure semantics are explicit. +- Stale-while-revalidate is useful when stale data is acceptable and marked by freshness policy. +- Negative caching needs short TTLs and careful invalidation for newly created resources. +- Derived views may rebuild from source data instead of backing up if rebuild time fits recovery objectives. + +## Response Quality Bar + +- Lead with the cache correctness decision, mitigation plan, or production blockers. +- Cover freshness, invalidation, stampede behavior, fallback, source-of-truth semantics, and observability before optional cache topics. +- Make recommendations actionable with checks, stop conditions, and rollback or bypass actions where relevant. +- Name the details to inspect, such as TTLs, hit/miss rates, source update events, stale-read bounds, and dependency saturation; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside cache and derived-data behavior. Route broader storage consistency or dependency overload only when materially unresolved. +- Be concise: avoid generic caching background and prefer compact consistency and mitigation tables. 
+ +## Required Outputs + +- Cache or derived-data decision record. +- Key, writer, invalidator, reader, and responsibility map. +- Freshness, TTL, invalidation, and versioning policy. +- Stampede and miss-amplification protection plan. +- Failure/degradation behavior. +- Cache-loss and cold-cache behavior, including entry-size bounds and backing-load impact. +- Metrics and alerts for freshness, stale reads, rebuilds, and downstream load. +- Repair/rebuild runbook and verification checks. + +## Checks Before Moving On + +- `freshness_check`: max staleness, TTL, and user-visible stale behavior are explicit. +- `invalidation_map`: writers, invalidators, readers, and versioning/repair paths are documented. +- `stampede_check`: miss storm and hot-key behavior are bounded. +- `cache_loss_behavior`: cold, flushed, unavailable, or partitioned cache behavior is defined. +- `cache_size_bound`: cache entries have size bounds or visibility into oversized entries. +- `downstream_check`: backing dependency capacity under cold/miss conditions is modeled. +- `repair_check`: rebuild/invalidate/repair runbook and correctness verification exist. + +## Red Flags - Stop And Rework + +- TTL is the only invalidation strategy for correctness-sensitive data. +- Cache miss paths can fan out enough to overload backing systems. +- Writers and invalidators are maintained by different projects with no contract. +- Stale entries are possible but not observable. +- Rebuild or reindex time is longer than the business recovery expectation. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Calling cache a performance-only detail | Treat it as correctness and availability behavior. | +| Hiding stale reads | Measure and expose freshness. | +| Ignoring cold starts | Model cache cold, location failover, and bulk invalidation. | +| Invalidating globally by default | Prefer scoped, versioned, or staged repair when possible. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/code-readability-for-agents.md b/plugins/sirmarkz/staff-engineer-mode/specialists/code-readability-for-agents.md new file mode 100644 index 00000000..f127f2f5 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/code-readability-for-agents.md @@ -0,0 +1,147 @@ +--- +name: code-readability-for-agents +description: "Use when repo structure, boundaries, naming, file size, or canonical paths affect AI-agent code comprehension" +--- + +# Code Readability For Agents + +## Iron Law + +``` +IF AN AGENT CANNOT LOCATE THE CANONICAL IMPLEMENTATION IN ONE TOOL CALL, THE STRUCTURE IS WRONG +``` + +Indirection that humans tolerate because they remember where things live becomes silent failure when an agent edits the wrong file, recreates a function that already exists, or hallucinates a helper that almost-but-not-quite matches the real one. + +## Overview + +Produces a repository legibility map for AI comprehension: a module-boundary map, a list of names that collide or mislead code search, a function-and-file size report against a defined budget, and a set of naming and layout patches that let an agent reach the canonical implementation in one tool call. Refuses to call code "clean" when an agent has to read three files to find where a behavior actually lives. + +**Core principle:** the repository is read by agents at least as often as by humans now. If the agent cannot find the canonical implementation deterministically, the structure is wrong, not the agent. + +## When To Use + +- The user is shaping repo structure, module boundaries, names, or canonical paths so AI coding agents can find and modify the right code. +- The user asks why their AI coding agent keeps editing the wrong file, recreating existing functions, or producing diffs that almost-but-not-quite match the local convention. 
+- A codebase is being prepared for AI-assisted contribution and you want to reduce wrong-file edits and hallucinated helpers. +- A repo has god files, files that exceed sensible read budgets, or modules whose names do not predict their contents. +- Code search returns multiple plausible matches for common verbs (`process`, `handle`, `update`, `run`) and the agent guesses wrong. +- A refactor is being planned and you want module boundaries that future agents can reason about, not only humans. +- Onboarding (human or agent) takes longer than the work justifies because canonical implementations are buried under indirection. + +## When Not To Use + +- The work is broad architectural decision-making across services or system boundaries; use `architecture-decisions`. +- The work is dependency cleanup, dead-code removal, or static-analysis findings on existing code; use `dependency-and-code-hygiene`. +- The work is org-level rules for AI-assisted coding (acceptance checks, data boundaries, protected paths); use `ai-coding-governance`. +- The work is checking one specific agent diff before merge; use `agent-pr-review`. +- The work is documentation lifecycle, responsibility, or freshness of engineering docs; use `documentation-lifecycle`. +- The work is API contract design or backwards compatibility on exposed surfaces; use `api-design-and-compatibility`. +- The work is generic review routing, change-size limits, or workflow metrics with no repository legibility issue; no routed specialist applies. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Repository scope: which directories are in scope, which are vendored or generated and excluded, and which are intentionally legacy. +- Agent traces if available: examples of recent agent runs where the agent edited the wrong file, missed the canonical implementation, or recreated a helper. 
+- Current module map: top-level packages or directories, stated responsibilities, and the actual exports each exposes. +- Naming inventory: function and class names that recur across modules, public verbs used as names, and any names that collide on case or near-case. +- File and function size distribution: largest files, longest functions, deepest nesting, and the size budget you have agreed (or the absence of one). +- Search hit-rate signal: for the common verbs and nouns of the domain, how many candidate matches a code search returns and how an outsider would pick one. +- Test placement convention: tests next to code, in a parallel tree, or scattered; the agent's ability to find tests for a given function predicts the agent's ability to verify changes. +- Doc co-location: whether each module has a short README or doc string that names its responsibility, public surface, and non-obvious invariants. +- Examples of canonical implementations you agree should be the only place a given behavior is implemented. + +## Workflow + +1. **Map the repo as the agent sees it.** List top-level modules and the verbs/nouns each exposes. Record any module whose name does not predict its responsibility. +2. **Run the one-tool-call test.** For a list of representative behaviors ("how does authentication happen," "where is the rate limit applied," "what validates this input"), check whether a single grep, symbol search, or doc lookup lands on the canonical file. Behaviors that fail the test become the first findings. +3. **Find name collisions.** Surface duplicate or near-duplicate function and class names across modules, especially common verbs (`process`, `handle`, `update`, `run`, `apply`, `save`). Each collision is a candidate disambiguation patch. +4. **Identify god files.** List files that exceed the size budget, hold more than one responsibility, or mix public surface with internal helpers. Each is a candidate split. +5. 
**Identify oversized functions.** List functions whose length, branching depth, or argument count exceed the budget. Long functions are unsearchable by behavior; an agent finds the file but not the responsibility within it. +6. **Identify ambiguous module boundaries.** Surface modules whose exports are partly used by callers that should not depend on them, modules that import caller modules, and modules whose stated purpose contradicts their actual exports. +7. **Check the canonical-implementation rule.** For each behavior you maintain, confirm there is one and only one implementation. Multiple plausible implementations are an agent failure mode in waiting; the agent will pick the wrong one. +8. **Check test discoverability.** Confirm a function's tests can be located by an agent using only the function's name and the repo convention. Hidden test mappings are a behavior-verification gap. +9. **Check doc co-location.** Confirm each module has a short, current statement of its responsibility, public surface, and invariants. A doc that lies is worse than no doc; flag stale docs as findings. +10. **Propose patches.** Issue concrete patches: rename collisions, split god files, extract internal helpers behind a clear public surface, move misplaced exports, add or correct module-level docs, and consolidate duplicate behavior into a single canonical site. +11. **Set the agent-search heuristic.** Document the conventions an agent should follow to find code in this repo (where canonical handlers live, where validators live, where adapters live, where tests live) and the conventions a contributor must follow to keep them true. +12. **Score the legibility.** Produce a scorecard: percent of representative behaviors that pass the one-tool-call test, count of collisions, count of god files, count of oversized functions, and count of modules with stale or missing co-located docs. + +## Synthesized Default + +Optimize the repository for one-tool-call discovery. 
Keep modules narrow and predictably named. Keep files and functions inside a defined size budget. Disambiguate common verbs in names. Co-locate tests and docs. Maintain a single canonical implementation per behavior. Document the agent-search heuristic so contributors keep it true. Treat repository legibility as a first-class engineering quality, not a refactor that happens "when there is time." + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Generated code may exceed the size budget if the generator is maintained and the file is not edited by hand; mark it generated and exclude it from the legibility score. +- Deliberately legacy modules under active replacement may keep their shape until cutover; record the exception, cutover condition, and concrete next patch. +- Domain-driven naming may require domain words that look ambiguous to outsiders but are precise inside the domain; the disambiguation lives in the module-level doc. +- Performance-critical code may justify a longer function or denser file when splitting would cost measured throughput; record the measurement and the check path that keeps the exception honest. 
+ +## Response Quality Bar + +- Lead with the legibility map, the one-tool-call failures, the renaming or splitting patches, or the agent-search heuristic requested. +- Cover module-boundary findings, name collisions, file and function size against the budget, canonical-implementation duplications, and test/doc discoverability before optional refactor breadth. +- Make recommendations actionable with file paths, exact rename targets, split boundaries, and the agent-search rule each patch protects. +- Name the details to inspect, such as code-search hit counts, file/function size measurements, agent traces where available, and the representative behaviors used for the one-tool-call test; do not claim legibility without the test results. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside repository legibility for AI comprehension. Route system architecture, dead-code cleanup, doc lifecycle, agent controls, and per-diff review to the responsible specialist. +- Be concise: prefer compact finding tables and patch lists over generic clean-code prose. + +## Required Outputs + +- Module-boundary map with stated responsibility, actual exports, and any contradictions. +- One-tool-call test results: a list of representative behaviors with the search query used, the candidate matches returned, and pass/fail. +- Name-collision list with each colliding name, the modules it appears in, and the proposed disambiguating renames. +- File and function size report against a stated budget, with the worst offenders listed and split or extraction patches proposed. +- Canonical-implementation report listing behaviors that have more than one plausible implementation and the proposed consolidation patch.
+- Test and doc discoverability report identifying functions whose tests are not findable by convention and modules whose co-located docs are missing or stale. +- Patch list: concrete renames, file splits, module-doc additions or corrections, and consolidations, each with file paths. +- Agent-search heuristic documenting where canonical handlers, validators, adapters, and tests live in this repo, with the contributor rule that keeps it true. +- Legibility scorecard: percent passing the one-tool-call test, collision count, god-file count, oversized-function count, and stale-doc count. + +## Checks Before Moving On + +- `boundary_map_present`: the map lists modules with stated responsibility and contradictions are named. +- `one_tool_call_test`: representative behaviors are tested for one-tool-call discovery; failures are listed with the search used. +- `collision_inventory`: colliding or near-colliding names are listed with their modules and proposed disambiguations. +- `size_budget_check`: a file and function size budget is stated and offenders are listed against it. +- `canonical_uniqueness`: behaviors with more than one plausible implementation are listed with consolidation patches. +- `discoverability_check`: tests and module docs are findable by convention or are flagged as gaps. +- `agent_search_heuristic`: a written convention for where canonical handlers, validators, adapters, and tests live is produced and is consistent with the patches recommended. +- `patch_actionable`: each recommended patch names the file or module, the exact change, and the legibility rule it protects. + +## Red Flags - Stop And Rework + +- The one-tool-call test is skipped because "you know where everything is." +- A behavior has two plausible implementations and the recommendation picks one without consolidating the other. +- Renames are proposed without sweeping callers, tests, and docs. 
+- A god file is "split" by moving code to a new file with the same responsibility, leaving two god files. +- The agent-search heuristic is written but contradicts the actual file layout the patches produce. +- Module docs are added that restate names rather than declaring responsibility, public surface, and invariants. +- Performance or legacy exceptions lack measurement, expiry, or a concrete cleanup patch. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Optimizing only for human readability | Test the one-tool-call rule; humans tolerate indirection that breaks agents. | +| Naming functions with bare verbs | Disambiguate with the noun the verb acts on; reserve common verbs for canonical sites. | +| Letting common behaviors live in many files | Consolidate to one canonical implementation; delete or redirect the others. | +| Splitting god files by line count | Split by responsibility; two equally-mixed files are not progress. | +| Documenting modules with restated names | Document responsibility, public surface, and non-obvious invariants. | +| Hiding tests in a parallel tree without convention | Co-locate or document the mapping rule so an agent can find tests by name. | +| Treating legibility as a one-time refactor | Make the agent-search heuristic a contributor rule; guard against regression. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/configuration-and-automation-safety.md b/plugins/sirmarkz/staff-engineer-mode/specialists/configuration-and-automation-safety.md new file mode 100644 index 00000000..cfdfe239 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/configuration-and-automation-safety.md @@ -0,0 +1,136 @@ +--- +name: configuration-and-automation-safety +description: "Use when one-shot config changes, scripts, cleanup automation, overrides, or drift fixes touch production state" +--- + +# Configuration And Automation Safety + +## Iron Law + +``` +NO CONFIG OR AUTOMATION CHANGE WITHOUT VALIDATION, PREVIEW, BLAST RADIUS, CONFIRMATION, AND RECOVERY PATH +``` + +If the change cannot be checked before execution and reversed or contained after failure, it is not safe enough. + +## Overview + +Configuration and automation can change production faster than ordinary code-change paths can expose the risk. + +**Core principle:** treat config, generated changes, and operational automation as production code with explicit schema, preview, user confirmation, and recovery path. + +## When To Use + +- The user asks about configuration safety, generated changes, operational scripts, bulk automation, feature settings, policy defaults, or config validation. +- A non-code change can alter routing, permissions, capacity, customer experience, data handling, or operational behavior. +- Automation creates, updates, deletes, migrates, or remediates production state. +- A pre-launch or unlaunched production environment can affect real users, data, credentials, capacity, or recovery expectations. +- Configuration drift, copy-paste settings, or untracked overrides are causing incidents. + +## When Not To Use + +- The main question is production rollout sequencing; use `progressive-delivery` instead. +- The main question is declarative infrastructure, admission, or drift reconciliation; use `infrastructure-and-policy-as-code` instead.
+- The main question is dependency cleanup or package updates; use `dependency-and-code-hygiene` instead. +- The request is one-off local scripting with no production or shared-state risk. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Config or automation surface, consumers, environments, affected production state, and local change path. +- Schema, allowed values, defaults, invariants, dependency ordering, and unsafe combinations. +- Change path, approval path, user confirmation, preview or dry-run output, execution identity, and change record. +- Blast radius, rollback or disable path, rate limit, lock, retry, and idempotency behavior. +- Operational levers: name, expected effect, activation time, prerequisites, last test, and disable or revert path. +- Change class and confirmation path: low-risk, standard production, or emergency; checks to make before the user proceeds. +- Prior incidents, drift reports, manual overrides, and exception rules. + +## Workflow + +1. **Classify the surface and change class.** Separate static config, dynamic config, generated changes, scheduled automation, and emergency automation; name the change class as low-risk, standard production, or emergency, with a distinct confirmation path for each class. +2. **Define the contract.** Specify schema, defaults, bounds, invariants, local change path, and incompatible combinations. +3. **Record production changes.** For production-impacting changes, including pre-launch production, capture user confirmation, confirmation basis, expected blast radius, and recovery path before execution. +4. **Validate before execution.** Require parse, semantic, dependency, permission, and environment checks before production use. +5. **Preview the effect.** Show intended creates, updates, deletes, traffic impact, permission changes, and affected systems before apply. +6. 
**Bound execution.** Use batches, locks, rate limits, stop criteria, and idempotency for automation that touches shared state. +7. **Make recovery concrete.** Define rollback, disable, restore, or roll-forward behavior for config, generated changes, and automation side effects. +8. **Prepare operational levers.** For emergency adjustment or recovery levers, state the effect, prerequisites, activation time, last test, and disable or revert path before relying on them. +9. **Control drift.** Detect unmanaged overrides and stale settings; decide reconcile, exception, or removal. +10. **Close the loop.** Record user confirmation, validation output, preview, execution result, and cleanup for temporary settings. + +## Synthesized Default + +Use typed config contracts, deterministic validation, effect preview, small execution batches, explicit user confirmation for production-impacting work, linked change records, drift checks, and tested recovery paths. Automation should be idempotent by default and should fail closed when it cannot confirm the intended target. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. 
+ +## Exceptions + +- Emergency automation may run with fewer pre-change checks when delay is riskier, but it still needs user confirmation, a linked change record, stop criteria, and post-change reconciliation. +- Low-risk local config can use lighter checks if it cannot affect shared systems, sensitive data, or production users. +- Some generated changes are easier to roll forward than roll back; document the recovery decision before execution. + +## Response Quality Bar + +- Lead with the safety decision, config contract, automation risk, or check matrix requested. +- Name the change class and confirmation path: low-risk changes need local validation results, standard production changes need explicit user confirmation plus preview output, and emergency changes need user confirmation plus post-change reconciliation. +- Cover validation, preview, blast radius, execution controls, drift handling, and recovery before optional automation detail. +- Make recommendations actionable with validation checks, stop criteria, batch size, linked change records, and cleanup where relevant. +- Name the details to inspect, such as schema, preview output, user confirmation, execution logs, drift reports, and rollback checks; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside config and automation safety. Use rollout, infrastructure policy, or dependency hygiene skills only when that surface is the immediate risk. +- Be concise: prefer compact contract and check tables over generic automation advice. + +## Required Outputs + +- Configuration or automation safety decision. +- Change class and confirmation path: low-risk, standard production, or emergency, with required checks and decision rationale. 
+- Production change record with user confirmation, expected effect, blast radius, and recovery results where the change can affect production state. +- Contract: schema, defaults, invariants, unsafe combinations, allowed overrides, and local change path. +- Validation and preview check list. +- Blast-radius and execution-control plan. +- Recovery plan for rollback, disable, restore, or roll-forward. +- Operational lever inventory with expected effect, activation time, prerequisites, last test, and disable or revert path. +- Drift detection and exception rules. +- Approval, execution, and cleanup checklist. + +## Checks Before Moving On + +- `change_class_confirmed`: low-risk, standard production, or emergency class is named with the required checks for that class. +- `change_record`: production-impacting config or automation has linked preview, user confirmation, execution identity, and recovery results. +- `contract_defined`: schema, defaults, bounds, invariants, and local change path are explicit. +- `preview_checked`: intended production effect is visible before execution. +- `blast_radius`: affected users, systems, and data are bounded. +- `recovery_path`: rollback, disable, restore, or roll-forward path is defined. +- `lever_ready`: emergency adjustment or recovery levers have named effect, prerequisites, activation path, and disable or revert path. +- `lever_tested`: operational levers have a recent test result or an explicit unknown. +- `change_log`: approval, validation, execution result, and exception state are linked. + +## Red Flags - Stop And Rework + +- Configuration bypasses validation because it is "not code." +- Unlaunched production is treated as non-production even though it can affect users, data, credentials, or recovery. +- Automation can delete or mutate shared state without preview. +- Defaults differ by environment without a documented reason. +- Recovery depends on remembering the previous value manually. 
+- Temporary overrides have no expiry or cleanup action. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Valid syntax as safety | Add semantic, dependency, and blast-radius checks. | +| One giant automation run | Use batches, locks, stop criteria, and idempotency. | +| Silent config drift | Detect, reconcile, or exception-check unmanaged changes. | +| Rollback by memory | Record prior state and verify recovery. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/cost-aware-reliability.md b/plugins/sirmarkz/staff-engineer-mode/specialists/cost-aware-reliability.md new file mode 100644 index 00000000..711628fb --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/cost-aware-reliability.md @@ -0,0 +1,129 @@ +--- +name: cost-aware-reliability +description: "Use when cost spikes, unit economics, or spend cuts must preserve reliability and SLO headroom" +--- + +# FinOps And Cost-Aware Reliability + +## Iron Law + +``` +NO COST CUT WITHOUT SLO, HEADROOM, BLAST-RADIUS, AND REGRESSION CHECK +``` + +If a saving silently consumes reliability margin, it is a risk decision, not an optimization. + +## Overview + +Cost is an operational signal, but reliability headroom is not waste by default. + +**Core principle:** optimize unit economics while preserving explicit reliability, capacity, recovery, and safety targets. + +## When To Use + +- The user asks about cost/reliability tradeoffs, unit economics, capacity headroom, tagging/allocation, cost regressions, reserved/committed/interruptible capacity mix, or budget-aware reliability. +- A service needs to reduce cost while maintaining an SLO or launch target. +- A cost spike may indicate traffic, inefficiency, abuse, deployment regression, or capacity misconfiguration. +- The user asks how much reliability headroom is justified. + +## When Not To Use + +- The user asks for pure billing support, procurement, contracts, or vendor negotiation; out of scope. 
+- The main topic is performance/capacity with no cost tradeoff; use `performance-and-capacity` instead. +- The issue is public abuse causing cost; use `edge-traffic-and-ddos-defense` instead. +- The request is financial reporting not tied to engineering decisions. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Service tier, SLOs, traffic, capacity model, failover headroom, and degradation behavior. +- Unit metrics: request, tenant, job, dataset, device, model inference, or business transaction. +- Cost allocation: environment, tenant/customer, feature, location, and workload class. +- Scaling policies, reserved/committed/interruptible mix, idle resources, and peak patterns. +- Data transfer, cross-location replication, telemetry/log volume, managed service overhead, and external traffic costs. +- Recent deploys, traffic changes, incidents, abuse signals, and cost regressions. +- Reliability risk tolerance and confirmation path for reducing headroom. + +## Workflow + +1. **State the reliability constraint.** Identify SLO, capacity headroom, failover target, and recovery requirement before cutting cost. +2. **Define unit cost.** Choose a meaningful engineering unit and map cost to service, feature, tenant, or workload. +3. **Find cost drivers.** Separate traffic growth, inefficient code, overprovisioning, idle capacity, data transfer, cross-location replication, telemetry/log volume, storage growth, retries, and abuse. +4. **Protect headroom.** Distinguish waste from required peak, failover, and surge capacity. +5. **Choose optimizations.** Use right-sizing, scheduling, storage lifecycle, caching, batching, data-transfer reduction, telemetry sampling/retention controls, capacity mix, or code efficiency where risk is explicit. +6. 
**Model commitment risk.** For committed capacity or discounts, state forecast confidence, lock-in window, unused commitment risk, exit path, and what reliability headroom is protected. +7. **Model tradeoffs.** State expected savings, reliability impact, security/operations side effects, blast radius, rollback, and monitoring. +8. **Add guardrails.** Alert on cost regressions, unit-cost anomalies, and reliability signals after changes. +9. **Check continuously.** Treat cost anomalies like operational regressions with post-change verification. + +## Synthesized Default + +Optimize unit cost with allocation, anomaly detection, right-sizing, and capacity-mix decisions, while preserving SLOs, required headroom, and recovery posture. Reliability-risk tradeoffs must be explicit and user-accepted; cheapest is not automatically cost-optimized. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Non-critical batch or preemptible workloads may use cheaper interruptible capacity if retries, deadlines, and data correctness are safe. +- Emergency cost controls can temporarily degrade non-critical features if user impact and rollback are explicit. 
+- Regulated, safety-critical, or tier-1 systems may keep high headroom even when utilization looks inefficient. +- Public abuse cost spikes should use `edge-traffic-and-ddos-defense` instead. +- Small estates may not justify heavy allocation pipelines; use coarse unit tracking until savings exceed instrumentation cost. + +## Response Quality Bar + +- Lead with the unit-cost model, cost driver, reliability tradeoff, optimization plan, or anomaly diagnosis requested. +- Cover allocation, unit metrics, driver separation, SLO/headroom preservation, failure-condition capacity, rollback, anomaly monitoring, and refresh cadence before optional FinOps breadth. +- Make recommendations actionable with metrics, savings ranges, risk acceptance, stop criteria, rollback steps, and post-change checks where relevant. +- Name the details to inspect, such as spend by usage units, traffic, capacity headroom, SLOs, peak/failure demand, deploy markers, anomaly timeline, and retry/abuse signals; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside cost-aware reliability. Route capacity, edge defense, platform, or data work only when those are the central unresolved risk. +- Be concise: avoid generic cost advice and prefer compact unit-cost, driver, and tradeoff tables. + +## Required Outputs + +- Unit-cost model and allocation plan. +- Cost driver analysis. +- Data-transfer, telemetry, and cross-location cost assessment where applicable. +- Reliability/headroom tradeoff record. +- Optimization plan with savings estimate, risk, and rollback. +- Commitment-risk record for reserved, prepaid, interruptible, or long-window capacity decisions. +- Cost anomaly and unit-regression dashboard requirements. +- Refresh cadence for cost signals. 
+- Follow-up routes to capacity, edge defense, platform, or data skills as needed. + +## Checks Before Moving On + +- `unit_check`: cost metric maps to an engineering unit and response path. +- `slo_headroom`: SLO, peak, and failure-condition headroom are preserved or risk is accepted. +- `driver_check`: cost drivers are separated before recommending cuts. +- `rollback_check`: optimization has rollback or mitigation plan. +- `regression_check`: post-change cost and reliability signals are monitored. + +## Red Flags - Stop And Rework + +- Cost reduction removes failover capacity without changing SLO or accepting risk. +- Only total monthly spend is tracked; no unit metric or response path exists. +- Idle capacity is labeled waste without peak/failure analysis. +- Interruptible capacity is used for work that cannot safely retry. +- Cost anomaly investigation ignores deploys, retries, abuse, and data growth. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Cutting before modeling risk | State SLO, headroom, and failure scenarios first. | +| Optimizing total spend only | Use unit economics tied to engineering responsibility. | +| Treating cost as finance-only | Add operational alerts and regression reviews. | +| Hiding tradeoffs | Record reliability risk and confirmation. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/cryptography-and-key-lifecycle.md b/plugins/sirmarkz/staff-engineer-mode/specialists/cryptography-and-key-lifecycle.md new file mode 100644 index 00000000..ce466160 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/cryptography-and-key-lifecycle.md @@ -0,0 +1,125 @@ +--- +name: cryptography-and-key-lifecycle +description: "Use when certificates, signing keys, secrets, algorithms, or crypto material need rotation or migration planning" +--- + +# Crypto Agility And Cert Lifecycle + +## Iron Law + +``` +EVERY KEY, CERT, AND ALGORITHM HAS AN EXPIRY DATE AND A TESTED REPLACEMENT PATH +``` + +If a certificate, key, algorithm, or trust root cannot be replaced safely on demand, the system is brittle. "Tested" means the replacement path has been exercised at least once outside an emergency, not just documented. + +## Overview + +Cryptography fails operationally when keys, certificates, algorithms, and trust roots cannot be inventoried or changed before a deadline. + +**Core principle:** keep cryptographic dependencies discoverable, maintained, renewable, replaceable, monitored, and tested before expiry or algorithm transition becomes an incident. + +## When To Use + +- The user asks about certificate expiry, key rotation, cryptographic algorithm transition, trust-chain changes, renewal automation, or cryptographic agility. +- A service depends on certificates, keys, signing, encryption, trust roots, or cryptographic policies that can expire or become deprecated. +- Rotation, revocation, renewal, or algorithm migration could break clients, jobs, devices, or partner integrations. +- You need checks that cryptographic material is inventoried, expiring, monitored, and replaceable. + +## When Not To Use + +- The main topic is identity authorization, secret storage, or service access policy; use `identity-and-secrets` instead. 
+- The main topic is artifact provenance or release signing; use `software-supply-chain-security` instead. +- The main topic is secure design broadly; use `secure-sdlc-and-threat-modeling` instead. +- The request is abstract cryptographic research with no engineering lifecycle decision. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Inventory of certificates, keys, algorithms, trust roots, consumers, expiry dates, and renewal paths. +- Usage context: authentication, encryption, signing, verification, storage, transport, or partner integration. +- Rotation process, automation, manual steps, confirmation, access logs, and emergency revocation path. +- Client and dependency compatibility, trust-store update path, fallback behavior, and rollback or roll-forward limits. +- Monitoring, alert thresholds, test environment coverage, and prior expiry or rotation incidents. +- Deprecation deadline, transition target, exceptions, and compensating controls. + +## Workflow + +1. **Inventory dependencies.** Find cryptographic material, algorithms, trust roots, consumers, and expiry or deprecation dates. +2. **Classify use.** Separate authentication, confidentiality, integrity, signing, verification, and storage use cases. +3. **Assess agility.** Determine whether each dependency can be renewed, rotated, revoked, or replaced without coordinated outage. +4. **Check compatibility.** Test old/new material and algorithm combinations with representative clients and workloads. +5. **Automate renewal carefully.** Use monitored renewal paths with alerting, audit, and failed-renewal response. Trigger renewal well before expiry — for example, at roughly two-thirds of the credential's lifetime — so that a single failed renewal cycle has time to be detected and retried before the credential expires. +6. 
**Rotate without coordinated downtime.** Default to a dual-credential overlap sequence: issue the new credential, configure verifiers to accept both old and new, migrate producers and clients to the new credential, verify zero traffic uses the old, then revoke. The verify-zero-old-traffic check is what makes the rotation zero-downtime; rotations that skip it convert routine rotation into an outage. +7. **Plan transitions.** Define overlap, dual support, rollout order, client migration, and retirement checks for deprecated algorithms or trust roots. +8. **Prepare emergency response.** Document revocation, compromise response, rollback or roll-forward, and communication path. +9. **Close exceptions.** Track unsupported material with expiry, risk, and compensating controls. + +## Synthesized Default + +Use a cryptographic inventory, expiry monitoring, tested rotation, dual-support transition windows, compatibility checks, emergency revocation plan, and exception register. Prefer designs where cryptographic material can be replaced independently of full application redeploys. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. 
+ +## Exceptions + +- Emergency compromise response may skip ordinary rollout windows, but must preserve audit and recovery results. +- Legacy clients may require overlap windows; keep them time-bound with usage telemetry and migration checks. +- Low-risk development material can use lighter monitoring if isolated from production trust paths. + +## Response Quality Bar + +- Lead with the lifecycle risk, rotation plan, transition decision, or expiry blocker requested. +- Cover inventory, responsibility, expiry, rotation, compatibility, monitoring, emergency revocation, transition windows, and exceptions before optional cryptographic detail. +- Make recommendations actionable with dates, checks, alert thresholds, compatibility tests, and retirement criteria where relevant. +- Name the details to inspect, such as inventory, expiry data, consumer list, rotation test output, renewal logs, alert rules, and exception records; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside cryptographic lifecycle. Use identity, supply-chain, or secure-design skills only when those surfaces drive the main decision. +- Be concise: prefer inventory and transition matrices over broad cryptography explanation. + +## Required Outputs + +- Cryptographic dependency inventory. +- Consumer, expiry, and renewal map. +- Rotation and renewal plan. +- Compatibility and dual-support test plan. +- Algorithm or trust-root transition plan. +- Monitoring and alert policy for expiry and failed renewal. +- Emergency revocation and compromise response. +- Exception register with expiry and compensating control. + +## Checks Before Moving On + +- `inventory_owned`: cryptographic material, algorithms, trust roots, consumers, and expiry dates are visible. 
+- `rotation_test`: renewal, rotation, or replacement is tested for representative consumers. +- `compatibility_window`: old/new compatibility and overlap duration are explicit. +- `expiry_monitoring`: expiry and failed-renewal alerts have a response path. +- `transition_check`: deprecated algorithms or trust roots have migration and retirement criteria. + +## Red Flags - Stop And Rework + +- Certificates are discovered only when expiry alerts fire. +- A key can be created but not rotated or revoked safely. +- Old and new trust paths are never tested together. +- Manual renewal depends on one person remembering a calendar date. +- Deprecated algorithms remain because clients are unknown. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Inventory only at issuance | Continuously track consumers and expiry. | +| Rotation without compatibility | Test old/new overlap before rollout. | +| Renewal without alerting | Monitor expiry and failed automation. | +| Permanent exceptions | Require risk and retirement check. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/data-contracts.md b/plugins/sirmarkz/staff-engineer-mode/specialists/data-contracts.md new file mode 100644 index 00000000..a5bce6e6 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/data-contracts.md @@ -0,0 +1,124 @@ +--- +name: data-contracts +description: "Use when designing shared schemas, events, datasets, files, streams, or domain interfaces across components" +--- + +# Data Contracts And Domain Interfaces + +## Iron Law + +``` +NO SHARED DATA INTERFACE WITHOUT A WRITTEN CONTRACT, COMPATIBILITY RULES, AND CURRENT OR PLANNED CONSUMERS +``` + +A "shared interface" is anything another component reads — a peer service, a downstream job, a different repo, even a future-you script. The contract states field meanings, types, and validity. Compatibility rules state what counts as additive vs breaking. 
For a new system, name the first expected consumers and the assumptions they depend on; for an existing system, name real consumers or the unknown-consumer risk. + +> This skill assumes the data crosses a component or repo boundary. If the data model is fully private to one component with no external readers, use `architecture-decisions` instead. + +## Overview + +Data contracts let projects change independently without guessing what consumers depend on. + +**Core principle:** make producer and consumer expectations explicit, versioned, maintained, compatibility-tested, and observable. + +## When To Use + +- The user asks about data contracts, schemas, domain interfaces, producer/consumer compatibility, schema evolution rules, or contract testing across projects. +- A new shared dataset, event shape, file, stream, or domain interface is being designed before consumers exist in production. +- A field, event, dataset, file, stream, or service output is consumed outside the responsible component. +- Producers and consumers deploy independently or interpret the same data differently. +- Data meaning, compatibility, responsibility, or evolution rules are unclear. + +## When Not To Use + +- One exposed service API contract is the whole problem; use `api-design-and-compatibility` instead. +- Workflow ordering, retries, or dead-letter handling is central; use `event-workflows` instead. +- Pipeline freshness, reprocessing, or lineage is central; use `data-pipeline-reliability` instead. +- The data model is fully private to one component and has no current or planned external consumers. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Producers, planned or existing consumers, domain meaning, critical fields, and consumer release cadence. +- Contract format, schema location, versioning policy, compatibility modes, and deprecation rules. 
+- Required, optional, nullable, defaulted, derived, sensitive, and deprecated fields. +- Consumer tests, sample payloads, expected or production usage, validation failures, and unknown consumers. +- Change workflow, compatibility checks, migration windows, and rollback or dual-publish needs. + +## Workflow + +1. **Find the boundary.** Identify every planned or existing consumer that relies on the data shape, semantics, timing, or quality. +2. **Define the contract.** Record field meanings, types, requiredness, defaults, units, sensitivity, responsibility, and validity rules. +3. **Choose evolution rules.** State what changes are compatible, conditionally compatible, or breaking. +4. **Version deliberately.** Use versions when semantics break; prefer additive changes when consumers can tolerate them. +5. **Test both sides.** Add producer validation and consumer-focused compatibility checks before merge or release. +6. **Measure adoption.** Before launch, state expected consumers and acceptance checks; after launch, track consumer usage, validation failures, deprecated fields, and migration progress. +7. **Plan deprecation.** Keep overlap, telemetry, consumer notice, and removal checks for breaking or semantic changes. +8. **Use adjacent checks.** Use API, event workflow, or pipeline reliability skills when execution details dominate. + +## Synthesized Default + +Use maintained, versioned, machine-checkable contracts for shared data boundaries. Prefer additive evolution, tolerant readers, producer validation, consumer compatibility tests, usage telemetry when available, and explicit deprecation checks. Treat semantic changes as breaking even when the field shape stays the same. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. 
+- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Single-component data can use lighter contracts if no independent current or planned consumers exist. +- Emergency corrections may break compatibility when wrong data is more dangerous, but need consumer impact analysis and repair plan. +- Exploratory data products can start advisory, then harden before production consumers depend on them. + +## Response Quality Bar + +- Lead with the contract decision, compatibility decision, schema evolution plan, or consumer migration requested. +- Cover planned or existing consumers, semantics, compatibility class, validation, consumer tests, telemetry where available, and deprecation checks before optional registry detail. +- Make recommendations actionable with compatibility matrix, change checks, migration batches, and removal criteria where relevant. +- Name the details to inspect, such as planned consumer assumptions, consumer inventory, schema history, sample payloads, validation output, usage telemetry, and migration status; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside shared data interfaces. Use API, workflow, or pipeline skills only when that surface is the unresolved risk. 
+- Be concise: prefer compact contract and compatibility matrices over generic process prose. + +## Required Outputs + +- Data contract decision with producers, planned or existing consumers, and domain meaning. +- Compatibility matrix for fields, semantics, timing, quality, and versioning. +- Validation and consumer-test plan. +- Evolution, deprecation, and migration plan with planned-consumer assumptions, telemetry where available, and removal checks. +- Sensitive-data handling notes for shared fields. +- Follow-up checks for API, workflow, or pipeline execution where needed. + +## Checks Before Moving On + +- `consumer_inventory`: planned consumers, known consumers, and unknown-consumer risk are explicit. +- `contract_defined`: field meaning, shape, requiredness, validity, and sensitivity are stated. +- `compatibility_class`: every change is classified as compatible, conditional, or breaking. +- `consumer_check`: compatibility is tested against real or representative consumer expectations. +- `migration_check`: deprecated or breaking changes have adoption telemetry and removal criteria. + +## Red Flags - Stop And Rework + +- A field keeps the same name but changes meaning. +- Producers say "nobody uses this" without usage data. +- Consumers parse undocumented fields or rely on incidental ordering. +- Validation checks shape but not required semantics. +- Deprecated fields have no removal check. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating schema as semantics | Document meaning, units, defaults, and validity. | +| Producer-only tests | Add consumer compatibility checks. | +| Guessing consumers | Name planned consumers before launch; use telemetry and responsibility discovery after launch. | +| Breaking by cleanup | Plan overlap and removal checks. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/data-pipeline-reliability.md b/plugins/sirmarkz/staff-engineer-mode/specialists/data-pipeline-reliability.md new file mode 100644 index 00000000..8ad85240 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/data-pipeline-reliability.md @@ -0,0 +1,124 @@ +--- +name: data-pipeline-reliability +description: "Use when designing or operating batch/streaming pipelines needing freshness SLIs, validation, lineage, or replay" +--- + +# Data Pipeline Reliability + +## Iron Law + +``` +NO CRITICAL DATASET WITHOUT FRESHNESS SLI, VALIDATION, LINEAGE, AND REPLAY PATH +``` + +If consumers cannot tell whether data is fresh and correct, the pipeline is not reliable. + +## Overview + +Critical data pipelines are production systems whose users notice stale, missing, duplicated, or incorrect data. + +**Core principle:** define freshness, completeness, correctness, lineage, replay, and recovery as explicit service guarantees. + +## When To Use + +- The user is designing, building, changing, or operating a batch or streaming pipeline and asks about freshness, correctness, completeness, lineage, missed runs, backfills, data-quality checks, or warehouse/ETL SLAs. +- Dashboards, reports, downstream services, or decisions depend on timely and correct data. +- A pipeline needs replay, reprocessing, backfill, or recovery behavior. +- The user asks how to alert on stalled or stale datasets. + +## When Not To Use + +- The main issue is model training/serving skew, model evaluation, or model rollback; use `ml-reliability-and-evaluation` instead. +- The request is service-to-service event workflow design; use `event-workflows` instead. +- The work is application database backfill execution; use `database-operations` instead. +- The question is primary data consistency semantics; use `distributed-data-and-consistency` instead. 
+ +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Pipeline graph, datasets, consumers, schedules, triggers, and dependencies. +- Freshness, completeness, correctness, latency, backlog age, and processing-error expectations. +- Source data contracts, schemas, watermarks, checkpoints, transform versions, and publish criteria. +- Validation checks, data-quality rules, anomaly detection, and known false-positive tolerance. +- Replay/backfill capability, idempotency, side effects, retention, and correction process. +- Lineage, change history, downstream impact, and incident history. + +## Workflow + +1. **Identify critical datasets.** Name consumers, business use, local responsibility path, and consequence of stale or wrong data. +2. **Define data SLIs.** Use freshness, completeness, correctness, latency, backlog age, and processing errors where relevant. +3. **Map lineage.** Record source, transform version, schedule/watermark, publish step, and downstream consumers. +4. **Check publication.** Validate schema, required fields, ranges, referential integrity, duplicates, and business invariants before publish. +5. **Make replay safe.** Ensure reprocessing is idempotent or explicitly handles duplicates and side effects. +6. **Alert on symptoms.** Trigger urgent alerts or tickets on freshness, backlog, stalled watermarks, and quality failures, not only job failure. +7. **Create recovery runbooks.** Include backfill, replay, quarantine, correction, republish, and consumer notification. +8. **Separate ML concerns.** Route model-specific eval, drift, and training/serving skew to ML systems reliability. + +## Synthesized Default + +Treat critical pipelines like services: SLI/SLO, validation checks, lineage, idempotent replay, symptom alerts, and recovery runbooks. A successful job is not enough if published data is stale, incomplete, or wrong. 
+ + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Exploratory datasets may use lighter checks if clearly labeled non-production. +- Some best-effort analytics can use follow-up tickets rather than urgent alerts if consumers accept delay. +- Streaming pipelines may use watermark/backlog SLIs instead of schedule-based freshness. +- Irreversible side effects during replay require quarantine and manual confirmation. + +## Response Quality Bar + +- Lead with the pipeline reliability target, blocker list, or replay plan requested. +- Cover freshness, completeness, correctness, lineage, replay, and quality checks before optional data-platform breadth. +- Make recommendations actionable with checks, stop conditions, and recovery actions where relevant. +- Name the details to inspect, such as row counts, watermarks, late-event rates, reconciliation checks, and backfill proofs; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. 
+- Stay inside pipeline reliability unless the prompt explicitly asks for warehouse architecture or ownership controls. +- Be concise: avoid generic data-quality background and prefer compact SLI/check/replay tables. + +## Required Outputs + +- Pipeline SLI/SLO table. +- Dataset responsibility and lineage map. +- Validation and publish-check plan. +- Replay/backfill/reprocessing runbook. +- Freshness, backlog, error, and quality alert policy. +- Consumer impact and notification plan. +- Recovery test results or test plan. + +## Checks Before Moving On + +- `freshness_sli`: every critical dataset has freshness or watermark target and measurement source. +- `publish_check`: publish path has data-quality checks and failure behavior. +- `lineage_responsibility`: source, transform, and consumers are recorded. +- `replay_safety`: replay/backfill is idempotent or duplicate/side-effect risk is controlled. +- `recovery_runbook`: stalled, bad, or late data has recovery steps and consumer communication path. + +## Red Flags - Stop And Rework + +- Alerting only checks whether the job process exited. +- Published data has no validation before consumers read it. +- Backfill can duplicate downstream side effects. +- A dataset used by production decisions has no freshness target, lineage, or replay path. +- Lineage is reconstructed manually during every incident. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating data pipelines as cron jobs | Treat them as services with SLIs, validation, and recovery paths. | +| Monitoring runtime only | Monitor freshness, completeness, correctness, and backlog. | +| Backfilling blindly | Make replay idempotent and validate output. | +| Publishing bad data fast | Validate data before publishing and quarantine failures.
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/database-operations.md b/plugins/sirmarkz/staff-engineer-mode/specialists/database-operations.md new file mode 100644 index 00000000..e5b262c0 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/database-operations.md @@ -0,0 +1,129 @@ +--- +name: database-operations +description: "Use when schema changes, backfills, indexes, destructive queries, query plans, locks, lag, throttles, or aborts matter" +--- + +# Database Operations And Schema Changes + +## Iron Law + +``` +NO PRODUCTION DATASTORE CHANGE WITHOUT LOCK/LAG ASSESSMENT, THROTTLE, ABORT, AND VERIFICATION +``` + +If you cannot pause, measure, and verify the change, it should not run against production state. + +## Overview + +Database changes are production releases with lock, lag, plan, and data-correction risk. + +**Core principle:** make schema, index, backfill, and maintenance changes observable, throttleable, verifiable, and reversible or forward-fixable. + +## When To Use + +- The user asks about online schema changes, index changes, production migrations, backfills, query-plan regressions, locks, replicas, compaction, vacuuming, or data maintenance. +- A data migration can affect latency, availability, data correctness, or rollback. +- A cleanup or destructive change touches production data. +- Query behavior changed after release or index/schema modification. + +## When Not To Use + +- The question is abstract storage or consistency choice; use `distributed-data-and-consistency` instead. +- The request is primarily about splitting a data model across databases, shards, or mutation boundaries; use `distributed-data-and-consistency` instead. +- The request is general rollout sequencing without database risk; use `progressive-delivery` instead. +- The primary concern is recovery after corruption or destructive change; use `backup-and-recovery` instead. 
+- The work is warehouse/ETL freshness; use `data-pipeline-reliability` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Datastore type, topology, table/collection size, write rate, read patterns, and critical queries. +- Whether the datastore is on a user-critical path, with failover mode, connection limits, query tail latency, restore readiness, and write behavior during failover. +- Proposed DDL/DML, index, backfill, cleanup, or maintenance operation. +- Lock behavior, replication lag, write amplification, query-plan risks, and operational windows. +- Backfill batch size, throttle rules, pause/abort controls, checkpointing, and idempotency. +- Verification queries, counts, checksums, invariants, and sampled correctness checks. +- Rollback versus forward-fix options, backup/restore test results, and destructive cleanup delay. +- Monitoring: latency, errors, lock waits, lag, slow queries, saturation, job progress, and user impact. + +## Workflow + +1. **Classify the change.** Separate additive schema, index, backfill, dual-write, cutover, cleanup, query-plan, and maintenance work. +2. **Assess production risk.** Identify locks, lag, write amplification, query-plan shifts, shard/partition effects, cache churn, failover interactions, and whether user-critical paths depend on the datastore behavior during those conditions. +3. **Use expand/contract in named phases.** Run schema evolution as four sequential phases — Expand (add the new structure, old code ignores it), Migrate (backfill data into the new structure), Transition (new code reads/writes both), Contract (remove the old structure once nothing references it). 
Each phase except Contract is rollback-safe on its own: a failed Expand drops the new structure, a failed Migrate leaves the old structure authoritative with the new partially populated, a failed Transition reverts code while the old structure still serves; a failed Contract has already validated everything, so investigate before retrying rather than rolling back. +4. **Throttle and checkpoint.** Run in small batches with pause/abort controls, progress tracking, idempotency, and load-sensitive throttles. +5. **Validate data.** Use verification queries, invariant checks, counts, sampling, and reconciliation before declaring completion. +6. **Delay destructive cleanup.** Keep rollback/forward-fix options until telemetry shows the new path is stable. +7. **Monitor during rollout.** Watch user symptoms, query latency, error rate, locks, lag, saturation, and job health. +8. **Document recovery.** State rollback, forward-fix, restore, and manual repair options before running. + +## Synthesized Default + +Use compatible expand/contract migrations, throttled idempotent backfills, explicit abort criteria, delayed destructive cleanup, and verification queries. Treat database operations as release events with telemetry, user confirmation for risky steps, and rollback checks; include partitioning and shard-map effects when data placement changes. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. 
+- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Small low-risk changes may run directly if lock/lag behavior is understood and rollback is simple. +- Destructive changes require backup/restore confidence and delayed cleanup unless data is provably disposable. +- Query-plan regressions may require emergency mitigation before a full migration plan, but details and follow-up remain required. +- Engine-specific mechanisms can be used, but the skill should express the required capability, not prescribe a product. + +## Response Quality Bar + +- Lead with the migration safety decision, blockers, or execution plan requested. +- Cover locks, query plans, backfill throttling, replication lag, verification, and rollback before optional database topics. +- Make recommendations actionable with checks, stop conditions, and rollback or pause criteria where relevant. +- Name the details to inspect, such as table size, write rate, lock behavior, replica lag, batch metrics, and validation queries; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside database change execution. Route broader distributed consistency only when semantic consistency is unresolved. +- Be concise: avoid generic database background and prefer compact phased runbooks. + +## Required Outputs + +- Database change plan with phases, confirmation points, and rollback checks. +- Lock, lag, write-amplification, and query-plan risk assessment. 
+- Critical-path database risk table covering failover, connection limits, query tail latency, restore readiness, and write behavior. +- Backfill or maintenance runbook with throttle, pause, abort, and checkpointing. +- Verification query/invariant plan. +- Monitoring and alert additions for the change window. +- Rollback or forward-fix decision record. +- Cleanup plan with delay and verification check. + +## Checks Before Moving On + +- `lock_lag_check`: lock behavior, replication lag, and write amplification are assessed. +- `db_critical_path`: database behavior on user-critical paths is assessed for failover, connection limits, query tail latency, restore readiness, and write behavior. +- `throttle_abort`: batch size, throttle, pause, abort, and confirmation point are defined. +- `verification_check`: data correctness verification queries or invariants exist. +- `rollback_check`: rollback or forward-fix path is written before execution. +- `cleanup_delay`: destructive cleanup is delayed until cutover is verified. + +## Red Flags - Stop And Rework + +- A migration runs as one unbounded transaction or job. +- Verification is "job completed" without data correctness checks. +- Destructive cleanup happens before old and new paths have been compared. +- Query plans are assumed unchanged after index/schema changes. +- There is no clear pause or abort mechanism. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating migrations as developer chores | Treat them as production releases. | +| Backfilling too fast | Throttle by user impact, lag, locks, and saturation. | +| Trusting row counts only | Add invariants, sampling, and reconciliation. | +| Removing old paths immediately | Delay cleanup until rollback is no longer needed.
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/dependency-and-code-hygiene.md b/plugins/sirmarkz/staff-engineer-mode/specialists/dependency-and-code-hygiene.md new file mode 100644 index 00000000..0c1dadc8 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/dependency-and-code-hygiene.md @@ -0,0 +1,125 @@ +--- +name: dependency-and-code-hygiene +description: "Use when dependency updates, dead-code removal, lockfile sweeps, codemods, or static-analysis ratchets need planning" +--- + +# Dependency Hygiene And Code Health + +## Iron Law + +``` +NO MAINTENANCE CHANGE WITHOUT SCOPE, REVERSIBILITY, AND NON-REGRESSION CHECKS +``` + +If a cleanup cannot be understood, tested, rolled back, or bounded, it is not hygiene; it is uncontrolled refactoring. + +## Overview + +Code health is maintained by routine, reversible, low-drama maintenance, not by occasional heroic cleanup. + +**Core principle:** keep dependencies, static findings, dead code, and refactors in small maintained batches with rollback, verification, and non-regression rules. + +## When To Use + +- The user asks about dependency updates, lockfiles, package deprecations, stale libraries, dead code, static-analysis backlog, codemods, or cleanup. +- You need recurring maintenance rules that do not block feature delivery. +- Existing warnings or findings need a ratchet so new debt is prevented while old debt is reduced. +- A mechanical refactor or dead-code removal needs safe execution. + +## When Not To Use + +- The main topic is build provenance, artifact signing, dependency inventory, builder isolation, or deployment admission; use `software-supply-chain-security` instead. +- The issue is an actively exploitable deployed vulnerability with an SLA; use `vulnerability-management` instead. +- The refactor changes architecture boundaries; use `architecture-decisions` instead.
+- The question is broad CI check strategy (test selection, coverage, mutation); use `testing-and-quality-gates` instead. Dependency-vulnerability scanning at PR/release time with a severity-blocking threshold remains in scope here. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Dependency inventory, direct/transitive responsibility, lockfiles, update cadence, and deprecated packages. +- Current vulnerable or outdated dependencies, runtime exposure, and patch urgency. +- Static-analysis findings, warning budgets, suppression rules, and existing baseline. +- Dead code candidates, usage telemetry, responsibility, and rollback plan. +- Codemod/refactor scope, generated changes, test coverage, and validation strategy. +- Release flow, canary options, and rollback capability for maintenance changes. + +## Workflow + +1. **Classify the work.** Separate routine updates, urgent patches, deprecations, static findings, dead code, codemods, and architecture-changing refactors. +2. **Batch conservatively.** Keep updates small enough to understand and roll back; separate risky runtime dependencies from safe dev-only updates. +3. **Preserve reproducibility.** Update lockfiles or equivalent pinned inputs intentionally and inspect transitive changes. +4. **Use risk-aware cadence.** Apply routine updates regularly; keep enough dependency inventory to identify affected deployed artifacts; treat active vulnerabilities as vulnerability-management work. +5. **Ratchet legacy findings.** Prevent new high-severity findings while gradually reducing the baseline. +6. **Confirm dead code is dead.** Use references, telemetry, responsibility confirmation, and staged deletion where risk is real. +7. **Execute codemods safely.** Check the pattern, sample output, affected responsibility, and validation results before broad application. +8. 
**Route trust controls.** If provenance, signing, or build trust becomes central, switch to supply-chain security. + +## Synthesized Default + +Use continuous small-batch maintenance with pinned inputs, dependency inventory, automated update proposals, small diffs, static-analysis ratchets, and reversible codemods. Treat routine hygiene separately from supply-chain integrity and deployed vulnerability remediation. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Emergency security updates can bypass normal batching when active exploitation risk dominates; record follow-up cleanup. +- Large mechanical codemods are acceptable when the pattern is checked, output is sampled, and validation is automated. +- Abandoned packages may require migration planning rather than direct update. +- Dead-code deletion in rarely used paths may require staged disablement before removal. + +## Response Quality Bar + +- Lead with the maintenance plan, risk classification, or rollback-safe execution path requested. +- Cover scope, reversibility, pinned inputs, ratchets, dead-code signals, codemod validation, and scope boundaries before optional hygiene topics. 
+- Make recommendations actionable with batches, validation commands, non-regression checks, stop criteria, and rollback or staged-disable steps where relevant. +- Name the details to inspect, such as dependency inventory, lockfile diffs, transitive changes, static baselines, usage telemetry, and sample codemod output; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside dependency hygiene and code health. Route provenance/signing or actively exploited vulnerabilities only when they are the primary issue. +- Be concise: avoid generic cleanup advice and prefer compact batch plans, ratchet tables, and validation checklists. + +## Required Outputs + +- Dependency update rules and cadence. +- Dependency-vulnerability scan integrated into PR/release CI, with the severity threshold that blocks merge or promotion. +- Lockfile or pinned-input inspection rule. +- Deprecated package and migration plan. +- Static-analysis backlog ratchet and suppression rules. +- Dead-code cleanup plan with usage signals and rollback. +- Codemod/refactor plan with scope, validation, and responsibility. +- Routing rules to vulnerability management or supply-chain security. + +## Checks Before Moving On + +- `scope_check`: maintenance work is classified and bounded. +- `reversibility_check`: update, cleanup, or codemod has rollback or staged disablement plan. +- `lockfile_check`: pinned input changes are inspected intentionally. +- `ratchet_check`: legacy findings have non-regression rule and reduction step. +- `route_check`: provenance/signing/dependency-inventory and deployed vulnerability work are routed to the correct specialist. + +## Red Flags - Stop And Rework + +- A huge dependency bump mixes runtime libraries, toolchain changes, and unrelated refactors.
+- Lockfile changes are treated as noise. +- Static-analysis warnings are either all ignored or all made blocking overnight. +- Dead code is removed without checking dynamic use, responsibility, or rollback. +- Routine dependency hygiene is confused with artifact provenance or signing. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Big-bang cleanup | Use small batches and ratchets. | +| Treating all dependencies alike | Separate runtime, build-time, test-only, and transitive risk. | +| Blind codemods | Check the transform, sample output, responsibility, and validation. | +| Suppression sprawl | Require reason, expiry, or baseline rule. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/dependency-resilience.md b/plugins/sirmarkz/staff-engineer-mode/specialists/dependency-resilience.md new file mode 100644 index 00000000..6f8f752e --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/dependency-resilience.md @@ -0,0 +1,136 @@ +--- +name: dependency-resilience +description: "Use when designing or changing remote calls or queues needing timeouts, retries, idempotency, or overload controls" +--- + +# Dependency Resilience And Overload + +## Iron Law + +``` +NO REMOTE CALL OR QUEUE WITHOUT TIMEOUT, RETRY, IDEMPOTENCY, AND OVERLOAD POLICY +``` + +If any dependency can wait forever, retry forever, queue forever, or fail ambiguously, the design is not production-safe. + +## Overview + +Most cascading failures are dependency failures amplified by callers. + +**Core principle:** every remote interaction needs a deadline, retry budget, idempotency story, overload behavior, and observable failure mode. + +## When To Use + +- The user is designing, building, adding, or modifying RPC, HTTP, database, cache, broker, stream, queue, webhook, or third-party calls. +- The user asks about retries, timeouts, backoff, jitter, circuit breakers, bulkheads, idempotency, backpressure, health checks, or load shedding. 
+- A service degrades when a dependency is slow, overloaded, unavailable, or returning errors. +- Queue depth, age, retries, or fanout can amplify failures. + +## When Not To Use + +- The request is only about in-process exceptions or validation. +- The question is where a service, module, or worker boundary should own responsibility; use `architecture-decisions` instead. +- The main question is SLO target policy; use `slo-and-error-budgets` instead. +- The main issue is topology and fault-domain survival; use `high-availability-design` instead. +- The problem is p99 optimization without dependency safety changes; use `performance-and-capacity` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Dependency matrix: caller, callee, operation, protocol, tier, and criticality. +- User impact if the dependency is slow or unavailable, caller-side dependency signals, and startup or scale behavior when runtime dependencies are unavailable. +- End-to-end request deadline, per-hop timeout, connection timeout, and cancellation behavior. +- Retry count, retry locations, backoff, jitter, retryable status codes/errors, adaptive retry budget, and overload signals that stop retries. +- Mutation idempotency: idempotency key, dedupe window, side effects, and replay behavior. +- Queue limits: max depth, age, drain rate, consumer concurrency, poison message handling, and DLQ policy. +- Overload signals: saturation, errors, latency, admission decisions, rejected work, and load-shed responses. +- Health checks: liveness, readiness, startup, dependency probes, and failure thresholds. + +## Workflow + +1. **Build the dependency matrix.** Include synchronous and asynchronous dependencies, third parties, control planes, shared infrastructure, user impact if failed, and caller-side metrics for latency, errors, timeouts, retries, and rejected work. +2. 
**Set the caller deadline.** Define the total time budget from the user's perspective, then allocate per-hop timeouts inside it. Calibrate each per-hop timeout from the downstream's measured tail latency (e.g., p99.9) with a target false-timeout rate ≤0.1%; do not infer timeouts from average latency. +3. **Bound retries.** Retry only when the operation is safe, useful, inside the deadline, jittered, and at one layer only — chained per-layer retries multiply load geometrically (three tries per layer across five layers is 243× load on the deepest dependency). Default to at most one retry on synchronous request-response paths; allow more on asynchronous or batch work with backoff and a dead-letter terminus. Enforce the retry rate with a token-bucket budget that replenishes on healthy responses and drains under systemic failure; do not retry explicit overload signals. +4. **Make mutations idempotent.** Require idempotency keys or durable dedupe for retryable writes, webhooks, and queue consumers. +5. **Handle partial batch outcomes.** If a batch call partially succeeds, retry only the failed or unknown items and preserve per-item correlation. +6. **Control queues.** Set max depth, max age, drain-rate alerts, poison handling, and backpressure before backlogs become unrecoverable. +7. **Smooth mismatched rates.** When callers can outpace dependencies, use durable buffering, controlled workers, and rate limits instead of unbounded memory queues. Size each per-dependency thread or connection pool from Little's Law as a starting estimate — `peak accepted TPS × chosen latency-or-timeout-seconds × safety factor` — then verify against pool wait time and saturation rather than treating the formula as definitive. +8. **Design overload response.** Prefer fail-fast, admission control, load shedding, and priority shedding before expensive work starts. 
When ordering semantics permit, prefer LIFO over FIFO under overload so newer requests are more likely still useful; propagate remaining-deadline hints transitively between hops so downstream services know when to stop. New isolation or admission limits should ship in observe-only mode first to confirm the threshold matches reality, then move to enforcement. Shed requests must remain visible in reject, shed, and error-budget metrics — exclude them only from latency percentiles, otherwise tail regression hides while the system silently fails. +9. **Use circuit breakers carefully.** For limiting retry-induced load, prefer a token-bucket retry budget over a breaker — it bounds aggregate retry rate without modal flapping. If a breaker is needed for primary-call protection, prefer additive-increase / multiplicative-decrease over binary open/closed; binary breakers oscillate under partial failure and add a rarely exercised failure mode. Name the threshold, half-open probe policy, close/recovery condition, and user-visible behavior while open. +10. **Keep health checks local.** Liveness probes must be shallow — no dependency calls — because a liveness probe that calls a shared dependency triggers cascading restarts the moment that dependency slows. Readiness may check immediate dependencies only when that cannot remove all capacity at once. Reserve enough local capacity (or an admission-bypass path) for cheap health-check responses to remain answerable while the rest of the service sheds overload — otherwise the orchestrator marks healthy instances dead during the exact incident the checks are supposed to survive. +11. **Keep startup independent where possible.** A restart, deploy, or scale-out path should not need every runtime dependency to be healthy unless the user-visible behavior, retry policy, and fallback are explicit. 
+ +## Synthesized Default + +Use bounded timeouts/retries with jitter, idempotent APIs, adaptive retry budgets, rate limiting, queue backpressure, and load shedding as the default. Retry only transient conditions inside the caller deadline and retry budget; do not retry permanent failures, overload signals, or already-successful batch items unless the contract explicitly says to. Treat circuit breakers as an exception mechanism, not the first tool. Avoid fallback unless the fallback is simpler, isolated, capacity-tested, and observably correct under the same dependency failure. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Some read-only idempotent requests can use hedging for tail latency, but only with capacity accounting and duplicate suppression where needed. +- A circuit breaker is appropriate when repeated calls make the outage worse and the open state has a tested user behavior. +- A fallback is acceptable when it is stale, cached, local, or reduced-quality by design, and does not depend on the same failing system. +- Non-critical asynchronous work may be dropped or delayed if loss semantics are explicit. 
+ +## Response Quality Bar + +- Lead with the dependency risk, timeout/retry budget, overload policy, or failure-mode plan requested. +- For short design answers, still include concrete values or placeholders for per-dependency timeout, retry count/backoff/idempotency, circuit-breaker open/half-open/recovery thresholds, and the degraded user behavior. +- Cover deadlines, retry safety, idempotency, backpressure, load shedding, health checks, fallbacks, and failure tests before optional resilience breadth. +- Make recommendations actionable with thresholds, budgets, queue limits, stop criteria, tests, and rollback or disablement steps where relevant. +- Name the details to inspect, such as dependency p95/p99 latency, error classes, retry counts, queue age, saturation, health-check behavior, and failure-test results; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside dependency resilience and overload. Route API contract or capacity-model work only when it materially blocks the failure-mode decision. +- Be concise: avoid generic retry guidance and prefer compact dependency matrices and budget tables. + +## Required Outputs + +- Dependency matrix with operation, protocol, criticality, and failure behavior. +- Caller-side dependency signals and startup/scale behavior for unavailable runtime dependencies. +- Timeout/deadline budget table for caller and each dependency. +- Retry policy with backoff, jitter, retryable conditions, overload stop signals, and retry budget. +- Idempotency and duplicate-handling plan for mutations and consumers. +- Queue/backpressure/load-shedding policy with thresholds. +- Circuit-breaker or fail-fast policy for sustained failures, including open threshold, half-open probe policy, close/recovery condition, and behavior while open. 
+- Health-check design separating liveness, readiness, startup, and dependency checks. +- Failure-mode tests or experiments for slow, erroring, overloaded, and unavailable dependencies. + +## Checks Before Moving On + +- `dependency_matrix`: every remote dependency and queue has timeout, retry, and failure behavior. +- `deadline_budget`: per-hop timeouts fit inside the end-to-end caller deadline. +- `retry_safety`: retryable calls, mutations, batch items, and consumers have retry budgets plus idempotency or dedupe behavior. +- `overload_bound`: queues are bounded and overload behavior is observable before saturation cascades. +- `health_check_safety`: health checks cannot remove the whole fleet because a shared dependency is unhealthy. +- `caller_side_signals`: dependency health is visible from the caller side for latency, errors, timeouts, retries, and rejected work. +- `startup_independence`: restart, deploy, or scale-out behavior under dependency unavailability is defined. + +## Red Flags - Stop And Rework + +- Retrying at client, gateway, service, SDK, and worker layers with no budget. +- Retrying downstream overload signals or already-successful batch items. +- Timeout values are absent, default, infinite, or longer than the caller's deadline. +- A queue has max depth but no max age, drain-rate alert, DLQ, or poison-message policy. +- Health checks call a shared dependency and mark all instances unavailable at once. +- Fallback is more complex than the primary path or shares the same failing dependency. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Adding retries to fix slowness | First set deadlines and understand capacity; retries add load. | +| Treating circuit breakers as magic | Define and test the open, half-open, and recovery behavior. | +| Ignoring idempotency | Make retryable writes duplicate-safe before enabling retries. | +| Letting queues absorb everything | Bound queues and shed, delay, or reject work deliberately. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/dev-environment-parity.md b/plugins/sirmarkz/staff-engineer-mode/specialists/dev-environment-parity.md new file mode 100644 index 00000000..c3f829ab --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/dev-environment-parity.md @@ -0,0 +1,154 @@ +--- +name: dev-environment-parity +description: "Use when local, CI, staging, or production drift causes parity failures across config, data, network, or secrets" +--- + +# Dev Environment Parity + +## Iron Law + +``` +NO FIX THAT WORKS ONLY LOCALLY COUNTS AS FIXED +``` + +A change that passes locally but fails in CI, passes in CI but fails in staging, or passes in staging but fails in production is unfinished work. The drift that hid the failure is the real defect. + +## Overview + +Produces a parity matrix across local, CI, staging, and production for the dimensions that decide whether a fix carries: dependency versions, configuration, data shape, time and clock behavior, network policy, and secret handling. Produces a drift-detection plan, a defined drift budget with action triggers, and a required-parity-versus-allowed-divergence taxonomy. Refuses to call a change shipped when it works only in the environment it was written in. + +**Core principle:** environments are a contract. Allowed divergence is named, bounded, and monitored; unnamed divergence is the bug that hides until the worst possible moment. + +## When To Use + +- The user is designing or relying on local, CI, staging, preview, or production-like environments and needs to decide which differences are allowed. +- The user reports a "works on my machine" failure or a green-CI-but-broken-staging failure. +- A migration, dependency update, or configuration change behaves differently across environments and you need to know which differences matter. 
+- A new environment (preview, ephemeral, branch-per-developer) is being introduced and you need to define how closely it must match the others. +- You are moving to ephemeral preview environments and need a parity contract before relying on them as a release check. +- An incident's root cause was a divergence (different library version, different timezone, different network egress) and you want to prevent the next one. +- A new contributor's local setup keeps producing diffs that fail CI for environment reasons rather than logic reasons. +- An AI coding agent is editing in an environment whose parity to CI or production is undeclared, and its diffs pass locally but break elsewhere. + +## When Not To Use + +- The work is producing reproducible release artifacts (pinned inputs, hermetic build, signed promotion); use `release-build-reproducibility`. +- The work is declaring infrastructure desired state, drift reconciliation against that desired state, or admission policy; use `infrastructure-and-policy-as-code`. +- The work is platform templates, golden paths, or a service catalog; use `platform-golden-paths`. +- The work is configuration safety in a single environment (validation, preview, blast radius, rollback); use `configuration-and-automation-safety`. +- The work is secret rotation, key management, or workload identity; use `identity-and-secrets`. +- The work is internal service mesh, discovery, or routing; use `internal-service-networking`. +- The work is an active production incident whose triage cannot wait for parity analysis; use `incident-response-and-postmortems`. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Environment inventory: local developer machines, CI runners, ephemeral or preview environments, staging, production, and any tier in between, with the person or script responsible for changing each environment. 
+- Dependency manifest per environment: language runtime version, system library versions, package lockfile state, and how each environment resolves them. +- Configuration manifest per environment: feature flags, environment variables, defaults, overrides, and the rule for how production-like each non-prod environment is. +- Preflight stage parity for critical release paths: configuration, dependencies, network, data shape, policy, and traffic-relevant limits. +- Data-shape manifest: schema versions, sample data sources, and whether non-prod data shape (cardinality, distribution, size) resembles production for the paths under test. +- Time and clock behavior: timezone, locale, NTP configuration, and any code paths that depend on wall-clock or monotonic time. +- Network policy: egress allowlists, ingress filters, DNS resolution, internal service reachability, and outbound rate or timeout differences across environments. +- Secret handling: how secrets are injected, scoped, and rotated per environment, and whether a non-prod environment has access to production-scope secrets. +- Recent incidents whose root cause was an environment divergence and the dimension responsible. +- Existing drift signals or detectors and their false-positive rate. + +## Workflow + +1. **Inventory the environments.** Name every environment a developer or CI uses, its purpose, who can change it, and the tier of confidence its results carry. +2. **Build the parity matrix.** For each environment pair, list the parity status across dimensions: dependency versions, configuration, data shape, time and clock, network policy, and secret handling. Mark each entry as required-parity, allowed-divergence (with reason), or unknown (a finding by itself). +3. **Define the required-parity dimensions.** Decide which dimensions must match across environments to keep test results meaningful. 
Dependency versions and configuration shape are usually required; production data values are usually forbidden in non-prod. +4. **Define the allowed-divergence dimensions.** Decide which dimensions are intentionally different and what the contract is: data scale, secret values, account identifiers, real third-party dependencies versus stand-ins, network egress scope. +5. **Set the drift budget.** State the acceptable size of divergence per dimension (for example, dependency-version skew within one minor version, configuration drift within a defined allowlist) and the action triggered when the budget is exceeded. +6. **Detect drift.** For each parity-required dimension, instrument a comparison: hash the dependency lock, snapshot the configuration, compare schema versions, compare clock and locale settings, compare network reachability matrices. Drift detection runs on a defined cadence, not only on incident. +7. **Set action triggers.** When drift exceeds the budget, the action is not "create a generic task." The action is named: block CI promotion, block deploy to the next environment, repair the environment contract, or open an incident-grade follow-up. +8. **Handle ephemeral and preview environments.** Ephemeral environments are useful only when their parity contract is explicit. State which dimensions they replicate from production and which they intentionally diverge on, so a passing preview means something specific. +9. **Define preflight parity.** For release preflight stages, state which critical path dimensions must match production closely enough for the result to be trusted. +10. **Bound third-party dependencies in non-prod.** Decide per dependency whether non-prod uses a stand-in, a sandbox, or the real production endpoint. Each choice has different parity properties; document them. +11. 
**Reproduce the failure across environments.** When a "works here, fails there" failure appears, the first action is to reproduce in each tier and identify the dimension responsible. The fix lives in that dimension, not only in the failing tier. +12. **Update the parity contract.** After every drift-related incident, update the matrix, the drift budget, or the detection so the same divergence cannot hide again. + +## Synthesized Default + +Define required parity and allowed divergence per dimension. Detect drift on parity-required dimensions on a defined cadence. Bound divergence with a budget and a named action when the budget is exceeded. Treat ephemeral and preview environments as parity-explicit, not parity-by-vibes. Reproduce environment-divergent failures in every tier before declaring a fix. Update the contract after every drift-rooted incident. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- A research or exploration environment may diverge intentionally; results from it cannot be used as release checks. 
+- A regulated workload may forbid production-realistic data in non-prod; the parity contract then privileges shape and schema over content, and the test data lives in a different fixture class. +- Performance and load environments may run on smaller capacity by design; the parity contract specifies which signals are still meaningful at reduced scale. +- A pre-production environment that exists only to exercise the rollout machinery may waive data and traffic parity, but must hold dependency, configuration, and policy parity. +- Local laptop environments may run a reduced subset of services with a documented stand-in policy, but a fix verified only against stand-ins is not yet a fix. + +## Response Quality Bar + +- Lead with the parity matrix, drift budget, drift-detection plan, allowed-divergence taxonomy, or environment-failure reproduction requested. +- When diagnosing a "passes here, fails there" failure, name the anti-pattern in plain language ("the local pass is a mocked happy-path result", "the fix is environment-only and does not count as shipped") AND name the enforcement that would have caught it (CI route-coverage check, readiness check, lint, readiness checklist). Do not let the structured matrix replace the verdict. +- Cover dependencies, configuration, data shape, time and clock, network policy, and secret handling before optional environment breadth. +- Make recommendations actionable with per-dimension parity status, drift budget, detection cadence, action trigger, and the environment change path. +- Name the details to inspect, such as dependency-lock comparisons, configuration snapshots, schema versions, clock settings, network reachability checks, and the drift signals that fired or did not fire; do not state parity without the comparison. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. 
+- Stay inside running-environment parity. Route release-artifact reproducibility, infrastructure desired state, platform templates, single-environment configuration safety, secret lifecycle, internal mesh, and incident command to the responsible specialist. +- Be concise: prefer compact parity matrices and budget tables over generic environment-management prose. + +## Required Outputs + +- Parity matrix across local, CI, ephemeral or preview, staging, and production for each dimension (dependencies, configuration, data shape, time and clock, network policy, secret handling) with required-parity, allowed-divergence, or unknown per entry. +- Required-parity-versus-allowed-divergence taxonomy: which dimensions must match, which may diverge with reason, and which are forbidden in non-prod. +- Drift budget per dimension with the size of acceptable divergence and the named action when exceeded. +- Drift-detection plan listing the comparison method, cadence, source of truth, and change path per dimension. +- Action-trigger table mapping each drift-budget breach to the action taken (block CI promotion, block deploy, repair environment contract, open follow-up). +- Ephemeral and preview environment contract stating replicated and diverged dimensions and what a passing run in those environments means. +- Preflight parity matrix for critical release paths, including which results are meaningful when intentional divergence remains. +- Third-party dependency stand-in policy per dependency with the parity properties of each choice. +- Reproduction protocol for "works here, fails there" failures with the order of tiers to reproduce in and the dimension-isolation steps. +- Follow-up routes to release reproducibility, infrastructure-as-code, platform paths, configuration safety, identity, internal networking, or incident response as needed. 
+ +## Checks Before Moving On + +- `environment_inventory_present`: every environment a developer or CI uses is named with change path and tier of confidence. +- `parity_matrix_present`: parity status is recorded per dimension per environment pair; unknowns are listed as findings. +- `divergence_taxonomy`: required-parity, allowed-divergence with reason, and forbidden combinations are explicit. +- `drift_budget_set`: each parity-required dimension has a numeric or categorical budget and a named action when exceeded. +- `drift_detection_active`: each parity-required dimension has an active comparison with cadence and change path. +- `action_triggers_named`: drift-budget breaches map to specific actions, not generic ticket creation. +- `preflight_environment_match`: release preflight stages match the production dimensions needed for critical-path confidence or state the limits of the result. +- `ephemeral_contract`: preview and ephemeral environments declare replicated and diverged dimensions explicitly. +- `reproduction_protocol`: a documented order and method for reproducing environment-divergent failures across tiers exists and is used. + +## Red Flags - Stop And Rework + +- A fix is declared shipped because it passes locally and the next environment is "probably fine." +- Dependency versions, configuration, or schema differ across environments and no comparison runs. +- Ephemeral or preview environments are treated as production-equivalent without a stated parity contract. +- Production-scope secrets are accessible from a non-prod environment with no recorded reason. +- A drift detector exists but the action on breach is "send an email." +- An incident's root cause was an environment divergence and the parity contract was not updated. +- Time, locale, or clock differences across environments are unmeasured even after a date- or timezone-related bug. 
+- Stand-in dependencies in non-prod produce different success contracts than the real dependency in production and nobody has documented the gap. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| "Works on my machine" closes the ticket | Reproduce across tiers; the fix lives in the diverged dimension. | +| Drift treated only as developer annoyance | Set a drift budget with a named action when exceeded. | +| Ephemeral environments trusted by default | State the parity contract; results count only for replicated dimensions. | +| Secrets shared across tiers for convenience | Scope secrets per environment; document any cross-tier exception with user-confirmed reason and expiry. | +| Detection without action | Map every breach to block, trigger an urgent alert, or open follow-up. | +| Divergence considered only on incident | Compare parity-required dimensions on a defined cadence. | +| Treating data shape and data values the same | Forbid production values in non-prod; require shape parity for the paths under test. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/distributed-data-and-consistency.md b/plugins/sirmarkz/staff-engineer-mode/specialists/distributed-data-and-consistency.md new file mode 100644 index 00000000..347174fa --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/distributed-data-and-consistency.md @@ -0,0 +1,124 @@ +--- +name: distributed-data-and-consistency +description: "Use when storage choice, database splits, sharding, transactions, consistency, locks, conflicts, or failover matter" +--- + +# Distributed Data And Consistency + +## Iron Law + +``` +NO READ OR WRITE PATH WITHOUT NAMED CONSISTENCY, CONFLICT, AND FAILOVER BEHAVIOR +``` + +For each read path and each write path, the design must say which consistency guarantee holds, what happens to a conflicting concurrent write, and what users observe during failover or replication lag. 
"Eventually consistent" without saying what users see between events, or "transactional" without saying which operations span the boundary, is not a design. + +## Overview + +Data architecture starts with semantics, not storage brands. + +**Core principle:** choose storage, replication, transactions, consistency, and sharding from the correctness guarantees each operation actually needs. + +## When To Use + +- The user is designing or changing storage choice, replication, consistency, transactions, sharding, hot keys, data correctness, distributed locks, or data responsibility. +- A service boundary changes who is responsible for mutating data. +- The design needs to choose between strong, eventual, read-your-writes, monotonic, causal, or quorum-style behavior. +- The user asks whether stale reads, duplicate writes, or conflicts are acceptable. + +## When Not To Use + +- The request is only cache TTL, invalidation, stampede, or materialized-view operation; use `caching-and-derived-data` instead. +- The question is online schema/backfill execution; use `database-operations` instead. +- The work is service event choreography; use `event-workflows` instead. +- The request is warehouse/ETL freshness rather than application data correctness; use `data-pipeline-reliability` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Data classes: money, authz, user settings, content, cache, derived state, analytics, notifications, access logs, or ML features. +- Operations: create, update, delete, read, list, search, reconcile, compensate, and repair. +- Correctness expectations: uniqueness, ordering, freshness, read-your-writes, conflict handling, idempotency, and durability. +- Access patterns, read/write volume, fanout, hot keys, tenant/shard routing, and growth forecast. +- Failure modes: partial writes, failover, replication lag, split brain, retries, duplicate leaders, and operator repair. 
+- Migration constraints, responsibility, change history needs, and backup/restore requirements. + +## Workflow + +1. **Classify data by consequence.** Financial, authorization, privacy, and audit data usually need stronger guarantees than analytics or derived views. +2. **Write operation semantics.** For each critical operation, define allowed staleness, conflict behavior, idempotency, and durability. +3. **Choose consistency deliberately.** Use the weakest guarantee that preserves correctness and user expectation; document the tradeoff. +4. **Avoid cross-service transactions.** Prefer local transactions plus outbox, sagas, reconciliation, or compensating actions over distributed two-phase commit. +5. **Plan partitioning early.** Choose shard/tenant keys, hot-key mitigations, locality needs, shard-map responsibility, resharding path, and responsibility boundaries. +6. **Treat locks and leaders as dangerous.** Use well-tested coordination primitives when necessary, and design work to be idempotent under duplicate execution. +7. **Define repair and verification.** Include reconciliation jobs, invariants, audit trails, and manual repair safety. +8. **Route operational changes.** Schema/backfill execution goes to database operations; cache mechanics go to caching. + +## Synthesized Default + +Default to the simplest storage and consistency model that satisfies operation semantics. Keep data responsibility local where possible, co-locate data that must transact together, use idempotency and durable state transitions, and avoid custom distributed coordination. When weaker consistency is chosen, state exactly what users may observe and how repair works. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. 
+- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Financial, authorization, inventory, and destructive operations may require strong consistency or formal modeling. +- High-scale read paths may accept stale or derived reads when user impact and repair are explicit. +- Multi-step workflows across independent mutation boundaries should use sagas or reconciliation rather than pretending one atomic transaction exists. +- Distributed locks are acceptable only with a well-tested primitive, lease semantics, fencing or idempotency, and failure tests. + +## Response Quality Bar + +- Lead with the consistency decision, tradeoff, or unresolved blocker. +- Cover data semantics, stale-read impact, conflicts, failure behavior, and operational cost before optional distributed-systems breadth. +- Make recommendations actionable with checks, stop conditions, and validation criteria where relevant. +- Name the details to inspect, such as invariants, latency budgets, conflict rates, replication behavior, and failure assumptions; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside the data consistency decision. Mention caches, workflows, or schema execution only when they materially change semantics. 
+- Be concise: avoid generic CAP/PACELC exposition and prefer decision matrices. + +## Required Outputs + +- Data classification table. +- Operation-level consistency matrix. +- Storage decision record with rejected alternatives. +- Replication, failover, and conflict-resolution model. +- Sharding/hot-key/tenant-routing plan. +- Transaction, outbox, saga, or reconciliation plan. +- Correctness verification and repair plan. + +## Checks Before Moving On + +- `semantics_check`: every critical operation has freshness, ordering, idempotency, conflict, and durability semantics. +- `consistency_choice`: chosen guarantees are justified by user consequence and failure behavior. +- `responsibility_check`: every data class has an explicit mutation boundary and repair path. +- `partition_check`: shard/tenant key, hot-key risk, and resharding approach are addressed where scale requires it. +- `repair_check`: invariants, reconciliation, change history, or manual repair path exists for known inconsistency modes. + +## Red Flags - Stop And Rework + +- Storage is selected before data semantics are written. +- "Eventually consistent" is used without saying what users can observe or how conflicts repair. +- Distributed locks are hand-rolled. +- Hot keys or tenant skew are ignored for a high-scale path. +- Cross-service writes are described as atomic without a mechanism or compensation plan. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| One consistency level for everything | Decide per operation and data class. | +| Using caches to solve semantics | Decide stale-read semantics here, then route cache mechanics. | +| Ignoring repair | Define invariants, reconciliation, audit, and correction paths. | +| Treating sharding as later | At least identify shard keys and hot-key risks early. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/documentation-lifecycle.md b/plugins/sirmarkz/staff-engineer-mode/specialists/documentation-lifecycle.md new file mode 100644 index 00000000..47ab7bd3 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/documentation-lifecycle.md @@ -0,0 +1,131 @@ +--- +name: documentation-lifecycle +description: "Use when runbooks, design docs, ADRs, or onboarding docs need owners, source of truth, or freshness rules" +--- + +# Engineering Documentation Lifecycle + +## Iron Law + +``` +NO CRITICAL ENGINEERING DOC WITHOUT AUDIENCE, SOURCE OF TRUTH, FRESHNESS RULE, AND CHANGE TRIGGER +``` + +If a doc can mislead an operator or steer a change, it needs an audience, source of truth, freshness rule, and change trigger before it is usable. + +## Overview + +Engineering documentation is useful only when it is findable, maintained, current, authoritative, and tied to the system it describes. + +**Core principle:** make docs part of the delivery system, with audience, freshness signal, source of truth, and change trigger. + +## When To Use + +- The user is designing, restructuring, or lifecycle-managing engineering docs, runbooks, design docs, decision records, onboarding guides, operational references, or documentation standards. +- The user asks to inventory stale docs or decide ownership, source of truth, freshness rules, verification cadence, or archive criteria. +- Documentation is stale, duplicated, missing, hard to find, or disconnected from code and operations. +- A launch, migration, incident, or deprecation needs docs that remain accurate after the change lands. +- You need a doc lifecycle, not just copy editing. + +## When Not To Use + +- The main artifact is an architecture decision; use `architecture-decisions`. +- The main artifact is an incident timeline or postmortem; use `incident-response-and-postmortems`. +- The request is marketing, sales, or public positioning copy.
+- The request is routine editorial or mechanical documentation maintenance with no source-of-truth dispute, operational guidance gap, stale-doc risk, or lifecycle decision. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Doc type, audience, source of truth, repo or system link, and user decision point. +- Current doc set, duplicates, stale pages, search paths, and missing operational references. +- Operational docs, runbooks, dashboard metric definitions, source of truth, freshness triggers, and alert/dashboard link health. +- Change triggers: code responsibility, service behavior, alerts, runbooks, interfaces, migrations, and deprecations. +- Verification cadence, freshness signal, archival rule, and exception path. +- Signs that users can find and apply the doc during real work. + +## Workflow + +1. **Classify docs by job.** Place every doc asset into exactly one quadrant: tutorial (learning-oriented), how-to (task-oriented), reference (information-oriented), or explanation (understanding-oriented). Tag runbooks and decision records separately as operational and architectural artifacts. Split or rewrite any doc that mixes quadrants until each piece sits in one. +2. **Name the audience.** State who uses the doc and what decision or task it supports. +3. **Assign responsibility.** Give every critical doc a user/agent responsibility path and an update trigger tied to the system lifecycle. Anonymous docs become stale silently. +4. **Pick the source of truth.** Remove or mark duplicates so readers know where authority lives. +5. **Add freshness signals.** Include last-verified state, lifecycle stage, change trigger, and archive rule; operational docs refresh after incidents, threshold changes, dependency changes, or readiness findings. +6. **Connect docs to delivery.** Link docs to code, alerts, dashboards, runbooks, release checks, or decision records where they are used. +7. 
**Test usability.** Verify a fresh agent or the user from a clean clone can find and follow the doc under realistic conditions. +8. **Retire stale docs.** Archive misleading content rather than keeping it searchable with no current source of truth. + +## Synthesized Default + +Use a lightweight documentation lifecycle: classify by user job, assign responsibility, define the source of truth, tie updates to system changes, add freshness signals, and archive stale material. Critical runbooks and launch docs should be checked as part of delivery, not after outages show they were wrong. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Short-lived design notes may expire after the decision is recorded elsewhere. +- Exploratory notes can remain rough if clearly marked as non-authoritative. +- Emergency docs may start minimal but need cleanup immediately after the event. + +## Response Quality Bar + +- Lead with the doc lifecycle, inventory, rewrite plan, or freshness check requested. +- Cover audience, source of truth, doc type, update trigger, discoverability, and archival rule before optional style advice.
+- Make recommendations actionable with verification cadence, stale-doc handling, and delivery checks where relevant. +- Name the details to inspect, such as current docs, usage paths, responsibility paths, stale pages, runbook tests, and change triggers; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside engineering documentation. Route architecture decisions, incident writeups, or marketing copy only when they are central. +- Be concise: prefer doc inventories and lifecycle rules over broad writing theory. + +## Required Outputs + +- Documentation inventory **table with explicit columns**: `Doc | Diátaxis quadrant (tutorial / how-to / reference / explanation) | Responsibility path | Source of truth | Last verified | Verification cadence | Staleness signal`. Runbooks and decision records tagged separately as operational/architectural. +- Source-of-truth map that **states the no-duplication rule explicitly** (e.g., "one canonical location per system; duplicates are marked non-authoritative or deleted"). +- Freshness rule naming **both verification cadence AND staleness signal** (e.g., "verify every 90 days; mark `stale` if last-verified > cadence or if linked alert/code changed without doc update"). +- Docs-as-code workflow: **traceable doc changes AND automated checks** (link-checker, markdown lint, CI build) running on every doc PR. +- Required docs for launch, operations, migration, or maintenance. +- Operational-doc freshness matrix for runbooks and dashboard metric definitions. +- Update triggers tied to code, operations, and release events. +- Stale-doc cleanup plan. +- Usability and findability checks. + +## Checks Before Moving On + +- `audience_job`: each critical doc names its reader and supported task. 
+- `doc_source`: responsibility path and source of truth are explicit. +- `quadrant_classification`: every doc in the inventory **table carries a visible quadrant label** (tutorial / how-to / reference / explanation); runbooks and decision records tagged separately as operational/architectural. Mixed-quadrant docs are split. +- `no_duplication_rule`: source-of-truth section states an explicit rule against duplication, not just "remove duplicates." +- `staleness_signal`: freshness policy names both a cadence and the signal that flips a doc to stale. +- `docs_as_code`: doc changes flow through linked changes AND automated checks (lint, link-check, or CI). +- `freshness_rule`: change trigger, lifecycle state, and archive rule exist. +- `delivery_link`: docs required for operation or launch are tied to delivery checks. +- `operational_doc_freshness`: runbooks and dashboard metric definitions have source of truth, freshness trigger, and last-verified signal. +- `usability_check`: someone can find and use the doc without tribal knowledge. + +## Red Flags - Stop And Rework + +- Two docs contradict each other and neither is marked authoritative. +- A runbook has no source of truth or last-verified signal. +- A launch depends on undocumented manual knowledge. +- Stale docs remain searchable after the system changes. +- Documentation standards focus on formatting while operational gaps remain. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Writing before audience | Start with reader job and decision. | +| Keeping every page forever | Archive misleading docs aggressively. | +| Treating docs as separate from delivery | Add update triggers to code and release workflows. | +| Style as control | Govern responsibility, truth, freshness, and usability. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/edge-traffic-and-ddos-defense.md b/plugins/sirmarkz/staff-engineer-mode/specialists/edge-traffic-and-ddos-defense.md new file mode 100644 index 00000000..8942ddfd --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/edge-traffic-and-ddos-defense.md @@ -0,0 +1,126 @@ +--- +name: edge-traffic-and-ddos-defense +description: "Use when public-edge rate limits, bot controls, origin isolation, abuse traffic, or DDoS response need design" +--- + +# Edge Traffic And Denial-Of-Service Defense + +## Iron Law + +``` +NO PUBLIC EDGE EXPOSURE WITHOUT ORIGIN PROTECTION, RATE POLICY, TELEMETRY, AND RULE ROLLBACK +``` + +If attackers can bypass the edge and hit origin directly, edge defense is incomplete. + +## Overview + +Public traffic must be filtered and shaped before abusive load reaches expensive systems. + +**Core principle:** layer volumetric, protocol, application, identity, and origin protections with telemetry and reversible rules. + +## When To Use + +- The user asks about public edge traffic, denial-of-service risk, edge caching, application-layer filtering, bot defense, abuse throttling, origin protection, traffic steering, or edge load shedding. +- Public traffic spikes, abusive clients, or bots threaten availability or cost. +- A service needs rate limits or request filtering before work reaches application dependencies. +- The user asks how to protect origins or global entry points. + +## When Not To Use + +- The issue is internal service retry/backpressure; use `dependency-resilience` instead. +- The request is normal capacity growth without abusive traffic; use `performance-and-capacity` instead. +- The main topic is application authorization; use `secure-sdlc-and-threat-modeling` or `identity-and-secrets` instead. +- The work is internal service mesh/routing; use `internal-service-networking` instead. 
+ +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Public endpoints, routes, origins, DNS/traffic steering, identity signals, and bypass paths. +- Traffic patterns, known attacks, request costs, tenant/customer priorities, and false-positive tolerance. +- Existing edge rules, rate limits, bot controls, challenges, allow/deny lists, and emergency controls. +- Origin capacity, dependency limits, caching behavior, and overload thresholds. +- Telemetry: rule ID, action, request ID, route, identity/tenant, status, latency, and origin result. +- Rule responsibility, rollout mode, dry-run capability, expiry, refresh cadence, and rollback path. + +## Workflow + +1. **Map the edge.** Identify public entry points, origins, bypass paths, and expensive downstream operations. +2. **Separate attack layers.** Distinguish volumetric, protocol, application-layer, credential-stuffing, scraping, and tenant-abuse patterns. +3. **Protect origin.** Restrict direct access, require edge-origin authentication where possible, and remove bypass routes. +4. **Shape traffic early.** Apply rate limits, quotas, challenges, caching, prioritization, and load shedding before expensive work. +5. **Specify rate rules.** For each protected route or route class, name the key, window, threshold, and breach action such as 429, deny, or challenge. +6. **Tune false positives.** Use dry-run or staged enforcement for new rules when possible; define false-positive signals. +7. **Instrument decisions.** Log rule, action, identity, route, request ID, and origin outcome. +8. **Plan emergency controls.** Predefine who can apply broad blocks, how long they last, and how they are checked. +9. **Expire rules.** Temporary mitigations need expiry, rollback, and post-event analysis. 
+ +## Synthesized Default + +Use layered edge protection: origin isolation, traffic steering, caching where correct, rate limits, bot/abuse controls, DDoS response planning, edge telemetry, staged rule rollout, and reversible emergency mitigations. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- During active denial-of-service events, temporary broad blocking may be acceptable if checked and expired quickly. +- Internal-only services can use lighter public-edge controls if no public route exists. +- High-value customers or critical traffic may need priority lanes or separate rate policies. +- Some rules cannot run in dry-run mode; compensate with narrow scope and fast rollback. + +## Response Quality Bar + +- Lead with the edge risk, denial-of-service or abuse policy, origin-bypass fix, or emergency mitigation requested. +- Cover origin isolation, route cost, identity-aware limits, bot/abuse controls, false-positive handling, edge telemetry, staged enforcement, rollback, and expiry before optional edge breadth. 
+- Make recommendations actionable with rule scopes, thresholds, dry-run/enforce stages, rollback commands, verification windows, and emergency authority where relevant. +- Include a compact rate-rule table for public APIs: route or route class, identity key (IP/session/user/tenant/API key), window, threshold, breach action, rollout mode, user-confirmed exception, and rollback. +- Name the details to inspect, such as DNS/origin exposure, route inventory, request rates, tenant/user identity, rule logs, false-positive samples, origin saturation, and mitigation history; do not state details you have not seen. +- Stay vendor/product-agnostic, but DO name the standard edge primitives by category: rate-limit breach action (e.g., 429, deny, challenge), bot-detection mechanism (challenge, fingerprint, behavioral, reputation-based) with false-positive handling, origin-shielding mechanism (edge-IP allowlist, signed origin headers, private connectivity, mutual-authentication transport) with a verification step, and load-shedding criteria with priority preservation (e.g., shed unauthenticated/low-priority before authenticated critical). +- Stay inside edge traffic and DDoS defense. Route broader capacity or abuse-product policy only when they materially block defense decisions. +- Be concise: avoid generic DDoS background and prefer compact edge maps, rule tables, and runbooks. + +## Required Outputs + +- Edge architecture and origin-protection map. +- Denial-of-service, abuse, and rate-limit policy — include a per-route or per-route-class rate-limit table where every row names the identity key, window, threshold, and breach action (429/deny/challenge); each bot control names its mechanism AND false-positive handling; origin-shielding lists a mechanism AND a verification step; load-shedding states criteria AND which traffic is preserved by priority. +- Origin bypass remediation plan. +- False-positive review and rollout plan. +- Edge telemetry and alert requirements. 
+- Emergency mitigation runbook. +- Rule responsibility, expiry, and rollback plan. + +## Checks Before Moving On + +- `origin_check`: origins cannot be trivially bypassed from public networks. +- `rate_policy`: rate limits or abuse controls are tied to identity, route cost, and false-positive tolerance. +- `telemetry_check`: edge decisions include rule, action, route, identity/request context, and origin result. +- `rollback_check`: enforcement rules have a rollout mode and a rollback path. +- `emergency_check`: broad mitigations have authority, expiry, and verification requirements. + +## Red Flags - Stop And Rework + +- Public clients can bypass edge controls and hit origin directly. +- Rate limits are global only and hurt good tenants before abusive traffic. +- Emergency block rules have no expiry. +- Edge logs cannot explain why a request was blocked. +- Rules are deployed broadly without rollback or user-confirmed exception. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| One giant block rule | Layer controls and scope them by route/identity/risk. | +| No origin isolation | Make bypass difficult or impossible. | +| Ignoring false positives | Use dry-run, staged enforcement, and review signals. | +| No edge telemetry | Log rule decisions and origin outcomes.
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/engineering-control-evidence.md b/plugins/sirmarkz/staff-engineer-mode/specialists/engineering-control-evidence.md new file mode 100644 index 00000000..ab4956fd --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/engineering-control-evidence.md @@ -0,0 +1,130 @@ +--- +name: engineering-control-evidence +description: "Use when designing or collecting cross-surface engineering records, scorecards, exceptions, or control maps" +--- + +# Engineering Control Records + +## Iron Law + +``` +NO CROSS-SURFACE CONTROL MAP WITHOUT REPEATABLE RECORD SOURCE, CADENCE, EXCEPTION PATH, AND REFRESH TRIGGER PER CONTROL +``` + +If a control cannot be inspected against a maintained, repeatable engineering artifact on a defined cadence, it is not part of this skill. Single-surface records belong in the matching specialist skill. + +> This skill assumes a multi-surface engineering records request. The artifacts it produces (cross-surface control maps, engineering scorecards, exception registers) exist to coordinate records across different engineering surfaces. A solo developer can still use it when one project needs a combined record pack; single-domain records stay with the matching specialist skill. +> Even in a cross-project context, this skill aggregates locally available engineering records for the user. It does not wait for legal, manager, or external sign-off. + +## Overview + +Engineering controls are useful only when they are close to the work and produce records from engineering systems. + +**Core principle:** aggregate records from artifacts projects already create: diffs, tests, build attestations, deployment records, runbooks, incidents, access-change records, scans, and exceptions. + +## When To Use + +- The request explicitly spans two or more engineering surfaces and asks to design or collect one record pack, scorecard, control-to-artifact map, or exception register. 
+- You need one normalized record inventory across SDLC, reliability, supply chain, access, vulnerability, observability, data, or operations because separate engineering surfaces otherwise duplicate tracking. +- The user asks how to show engineering standards are followed across delivery and operations using artifacts from normal engineering work. +- Cross-surface engineering exceptions need expiry, compensating controls, residual risk, and revisit triggers in one register. +- A multi-surface engineering record pack is required and no single specialist covers the full surface set. + +## When Not To Use + +- The request is single-launch, single-traffic-shift, or tier-change readiness; use `production-readiness-review`. +- A single specialist covers the needed records directly: deployed vulnerability details belong to `vulnerability-management`; build-path provenance belongs to `software-supply-chain-security`; identity, secrets, and access details belong to `identity-and-secrets`; reliability target details belong to `slo-and-error-budgets`; alert and telemetry details belong to `observability-and-alerting`; backup and restore test results belong to `backup-and-recovery`; tenant boundary checks belong to `tenant-isolation`; data lifecycle details belong to `privacy-and-data-lifecycle`; data pipeline details belong to `data-pipeline-reliability`; threat-model details belong to `secure-sdlc-and-threat-modeling`; AI-assisted change verification belongs to `ai-coding-governance`. +- The user asks for records but actually wants a single-domain answer; use the matching specialist above. +- The request is broad compliance, legal, procurement, vendor risk, auditor-liaison program management, or business program management outside engineering lifecycle and operations. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Engineering standards, systems in scope, delivery decisions, and who needs the records. 
+- Existing artifacts: PRs, CI logs, tests, scans, build records, deployments, runbooks, incidents, access reviews, and dashboards. +- Refresh cadence, exception rules, and who can accept the risk. +- Current scorecards, manual collection burden, gaps, incidents, and recurring findings. +- Required engineering expectations or internal guidelines and how they map to engineering behavior. +- Broad practice or checklist inputs that need translation into concrete engineering behavior and one owning specialist per item. + +## Workflow + +1. **Check cross-surface scope.** Confirm the work spans at least two specialist engineering surfaces and that no single specialist covers the full record set. If the request is single-launch readiness, use `production-readiness-review`. If the request is single-domain records, use the matching specialist and stop. +2. **Map expectations to behavior.** Express each expectation as something engineers do, prevent, detect, confirm, test, or verify. +3. **Translate source material.** For broad practice inputs, rewrite each technical item into capability language, assign one owning specialist, and skip org-only or non-technical items. +4. **Locate records near engineering work.** Prefer generated records from changes, CI, deploys, access systems, scanners, runbooks, and incidents. +5. **Assign responsibility and cadence.** Every record source needs an owner, refresh cadence, and failure response. +6. **Define exceptions.** Require expiry, compensating control, refresh trigger, and risk-acceptance authority appropriate to severity. +7. **Build scorecards carefully.** Score capabilities and record state, not vanity metrics. Normalize overlapping security, reliability, supply-chain, operations, and internal engineering expectations into one record map. +8. **Create standards backlog.** Record gaps from failed record pulls, expired exceptions, incidents, and recurring findings with severity, expected fix path, and target date. +9. 
**Feed findings back.** Use incidents, failed reviews, and recurring exceptions to update standards and platform defaults. + +## Synthesized Default + +Keep records close to engineering workflows and automate collection where possible. Use one expectation-to-record map across overlapping engineering standards, benchmarks, and internal checklists so projects do not maintain duplicate tracking or conflicting interpretations. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Single-surface records should be produced by the specialist skill, with this skill only aggregating if needed. +- Manual records can be temporary when automation is not yet available, but they need expiry and a replacement path. +- Legal/auditor-facing interpretation is out of scope; this skill produces engineering records, not legal conclusions. +- Threat-detection mapping is included only when detection coverage is explicitly in scope. + +## Response Quality Bar + +- Lead with the expectation-to-record map, scorecard, exception register, or record pack outline requested. 
+- Cover engineering behavior, repeatable record sources, cadence, pass/fail states, exceptions, and workflow fit before optional program breadth. +- Make recommendations actionable with artifact sources, collection cadence, failure response, automation backlog, and exception expiry where relevant. +- Name the details to inspect, such as CI results, deploy records, configuration snapshots, change records, incident records, control outputs, and source artifact links; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside engineering records. Do not make legal, procurement, staffing, or external-assurance statements. +- Be concise: avoid generic compliance language and prefer compact engineering record tables. + +## Required Outputs + +- Engineering expectation-to-behavior-to-record map. +- Translation map from broad practice items to one owning specialist, with non-technical items marked skipped. +- Record inventory with source, cadence, and retention. +- Scorecard with pass/fail/exception states. +- Exception register with expiry, compensating controls, refresh trigger, residual risk, and acceptance authority. +- Record pack outline linked to source artifacts. +- Standards update backlog with gap source, engineering expectation, severity, expected fix path, and target date. + +## Checks Before Moving On + +- `scope_check`: request explicitly spans two or more engineering surfaces, no single specialist covers the full record set, and non-engineering program management is excluded. +- `record_source`: every expectation maps to a repeatable engineering artifact source. +- `source_translation`: broad practice inputs are rewritten as concrete engineering behavior and assigned to one owning specialist, with org-only items skipped. 
+- `cadence_check`: every record source has refresh cadence and failure response. +- `exception_check`: exceptions have expiry, compensating control, and refresh trigger. +- `workflow_fit`: records are captured from normal engineering workflows where possible. + +## Red Flags - Stop And Rework + +- Expectations are copied from standards without mapping to engineering behavior. +- A key record is a screenshot someone must manually collect every quarter. +- Exceptions never expire. +- Scorecards reward document presence rather than control effectiveness. +- The skill is being used as lawyer, procurement owner, staffing owner, or compliance-program manager. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Central record chores | Put records in the workflow that creates them. | +| Duplicate maps per standard | Normalize overlapping expectations into one record map. | +| Open-ended exceptions | Add expiry, compensating control, and refresh trigger. | +| Using this for everything | Prefer domain skills for single-surface records. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/event-workflows.md b/plugins/sirmarkz/staff-engineer-mode/specialists/event-workflows.md new file mode 100644 index 00000000..78a914ea --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/event-workflows.md @@ -0,0 +1,131 @@ +--- +name: event-workflows +description: "Use when events, messages, queues, streams, sagas, or workflows need idempotency, retry, DLQ, ordering, or replay" +--- + +# Event Driven Systems And Workflows + +## Iron Law + +``` +NO EVENT OR WORKFLOW WITHOUT CONTRACT, IDEMPOTENCY, RETRY, DLQ, AND REPLAY POLICY +``` + +If consumers cannot safely see a message twice or late, the workflow is not production-ready. + +## Overview + +Asynchronous systems trade call-time coupling for delivery, ordering, replay, and correction obligations. 
+ +**Core principle:** assume duplicate, delayed, reordered, and replayed messages unless the design handles those cases explicitly. + +## When To Use + +- The user asks about events, queues, streams, change capture, transactional outbox, sagas, retries, DLQs, replay, message schemas, or workflow orchestration. +- A design replaces synchronous calls with asynchronous processing. +- A multi-step business process spans services or responsibility boundaries. +- The user asks how to publish state changes reliably. + +## When Not To Use + +- The design is only synchronous RPC or HTTP call policy; use `dependency-resilience` instead. +- The main question is storage consistency or transaction semantics; use `distributed-data-and-consistency` instead. +- The prompt centers a database or storage boundary where correctness can break; use `distributed-data-and-consistency` instead. +- The work is batch/warehouse freshness and lineage; use `data-pipeline-reliability` instead. +- The issue is cache invalidation only; use `caching-and-derived-data` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Producers, consumers, topics/queues/streams, and event purpose. +- Event type: notification, state transfer, event-sourced fact, command, reply, or workflow step. +- Schema, compatibility rules, required fields, versioning, and responsibility. +- Delivery semantics, ordering needs, partition key, idempotency key, and dedupe window. +- Retry policy, backoff, max attempts, DLQ handling, poison message behavior, and manual repair. +- Queue bounds, age, depth, drain rate, consumer concurrency, and batched-message per-item status. +- Replay needs, retention, correction process, and consumer side effects. +- Backlog metrics, processing latency, freshness, consumer lag, and alert thresholds. + +## Workflow + +1. 
**Classify the pattern.** Distinguish notification, event-carried state, event sourcing, command, CQRS read model, saga, and workflow orchestration. +2. **Define the contract.** Write schema, meaning, responsibility, compatibility, and versioning rules before implementation. +3. **Publish atomically.** Use a durable local transaction plus outbox or equivalent when state change and message publication must agree. +4. **Make consumers idempotent.** Design dedupe, commutative updates, durable processing markers, or safe side effects. +5. **Control retries.** Bound attempts, add backoff/jitter, isolate poison messages, define DLQ responsibility, and retry only failed or unknown items in batched work when item status is available. +6. **Plan ordering and partitioning.** Order only where necessary; choose partition keys that avoid hot partitions and preserve required entity order. +7. **Design replay and correction.** Ensure reprocessing is safe, observable, and can repair bad events or bad consumers. +8. **Instrument the flow.** Track enqueue time, age, depth, lag, drain rate, processing errors, consumer concurrency, DLQ volume, batched-item status, and replay progress. + +## Synthesized Default + +Use at-least-once delivery with idempotent consumers as the default mental model. Use outbox or equivalent for atomic publish, sagas or workflow state for multi-step processes, schema compatibility for evolution, durable queueing for rate mismatch, and explicit replay/correction for recovery. Treat event sourcing as a high-complexity pattern, not a default persistence style. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. 
+- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Broker-level exactly-once guarantees may reduce duplicates inside one boundary, but consumers still need duplicate-safe business outcomes. +- Ordering should be scoped to the smallest entity needed; global ordering is rarely worth the throughput and availability cost. +- Fire-and-forget notifications are acceptable only when loss, duplication, and delay are explicitly harmless. +- Human confirmation workflows may prefer explicit workflow state over event choreography. + +## Response Quality Bar + +- Lead with the workflow state model, failure handling plan, or blockers. +- Cover idempotency, ordering, retries, DLQ/poison handling, compensation, and reconciliation before optional event-system topics. +- Make recommendations actionable with checks, stop conditions, and replay controls where relevant. +- Name the details to inspect, such as event keys, retry counts, duplicate rates, DLQ age, consumer lag, and replay checks; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside the workflow and event contract. Route broad API or data consistency issues only when material. +- Be concise: avoid generic event-driven background and prefer compact state/retry/DLQ tables. + +## Required Outputs + +- Event/workflow contract and schema compatibility policy. 
+- Producer/consumer responsibility matrix. +- Idempotency and duplicate-handling plan. +- Retry, backoff, DLQ, and poison-message policy. +- Queue/workflow overload table covering depth, age, drain rate, consumer concurrency, poison path, and batched-item status. +- Ordering, partitioning, and hot-key plan. +- Replay, correction, and manual repair plan. +- Observability requirements for age, lag, depth, errors, and replay. + +## Checks Before Moving On + +- `contract_check`: event meaning, schema, and compatibility rules are documented. +- `idempotency_check`: every consumer side effect is duplicate-safe or explicitly non-retryable. +- `retry_dlq_check`: retry attempts, backoff, DLQ responsibility, and poison handling are defined. +- `queue_bound`: queue depth, age, drain rate, and consumer concurrency have bounds or explicit unknowns. +- `poison_path`: poison item handling, DLQ responsibility, and manual repair path are defined. +- `batch_item_status`: batched work records per-item success, failure, or unknown status before retry. +- `ordering_check`: ordering and partition key choices match the entity semantics. +- `replay_check`: replay/correction path is safe and observable. + +## Red Flags - Stop And Rework + +- Consumers assume exactly-once delivery without dedupe or idempotent side effects. +- DLQ exists but draining, replay, or correction has no runbook. +- Events are named after implementation steps rather than durable business facts. +- Schema changes have no compatibility rules. +- Replay would send emails, charge cards, or trigger irreversible actions again. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Using events to hide coupling | Make responsibility and contract explicit. | +| Treating DLQ as storage | Define triage, replay, and discard policy. | +| Requiring global order | Order only per entity or workflow where needed. | +| Forgetting correction | Plan bad-event and bad-consumer repair from the start. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/experimentation-and-metric-guardrails.md b/plugins/sirmarkz/staff-engineer-mode/specialists/experimentation-and-metric-guardrails.md new file mode 100644 index 00000000..b3acee63 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/experimentation-and-metric-guardrails.md @@ -0,0 +1,121 @@ +--- +name: experimentation-and-metric-guardrails +description: "Use when designing A/B tests, holdouts, ramps, or readouts needing decision metrics and guardrails" +--- + +# Experimentation And Metric Guardrails + +## Iron Law + +``` +NO EXPERIMENT CALL WITHOUT A HYPOTHESIS, A KNOWN EXPOSED POPULATION, GUARDRAIL METRICS, AND A PRE-COMMITTED READOUT RULE +``` + +The experiment must say what it predicts, record who actually saw the change (not just who was assigned), name the safety/quality metrics that can block a positive primary result, and commit to the decision rule before reading the result. For a small-project or hand-rolled experiment "known exposed population" can be as simple as "logged-in users on build SHA X after timestamp Y" - the invariant is that you can answer who was affected, not that you have an experimentation platform. + +## Overview + +Experiments are only useful when assignment, exposure, metrics, and decision rules are trustworthy. + +**Core principle:** design experiments with clear hypotheses, stable assignment, reliable exposure logging, predeclared metrics, guardrails, and invalidation checks. + +## When To Use + +- The user is designing, changing, running, or reading out an experiment, A/B test, holdout, or ramp decision and asks about sample-ratio mismatch, exposure logging, guardrail metrics, or metric trust. +- A product, ranking, pricing, UI, recommendation, or workflow change needs a causal readout rather than only rollout health. 
+- Experiment results conflict, look too good, lack power, or may be invalid because of logging, assignment, contamination, or metric defects. +- A ramp needs outcome guardrails beyond operational canary checks. + +## When Not To Use + +- The main question is blast radius, rollback, canary, or operational rollout; use `progressive-delivery` instead. +- The main question is service reliability objectives or alerting policy; use `slo-and-error-budgets` instead. +- The main question is LLM evals or model release checks; use `llm-evaluation` or `ml-reliability-and-evaluation` instead. +- The request is product strategy with no engineering measurement artifact. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Hypothesis, decision to make, target population, unit of assignment, treatment, control, and exposure rule. +- Primary metric, guardrail metrics, diagnostic metrics, minimum effect, runtime, and stopping rule. +- Assignment implementation, eligibility filters, ramp plan, holdout policy, and contamination risks. +- Exposure logging, event definitions, metric pipelines, missingness, delayed effects, and data-quality checks. +- Segment/slice plan, interaction with other experiments, and decision point. + +## Workflow + +1. **State the decision.** Define the hypothesis and what action the readout will drive. +2. **Choose assignment unit.** Pick a stable unit that matches the effect being measured and avoids cross-contamination. +3. **Define exposure.** Log when the user or entity could actually be affected, not only when assignment occurred. +4. **Predeclare metrics.** Name primary, guardrail, diagnostic, and segment metrics before reading results. +5. **Check validity.** Test assignment balance, sample-ratio mismatch, missing telemetry, logging defects, and eligibility drift. +6. 
**Plan interactions.** Identify overlapping experiments, long-lived holdouts, novelty effects, and downstream metric coupling. +7. **Check ramps.** Combine experiment outcomes with operational guardrails; do not let positive primary metrics hide safety regressions. +8. **Record the decision.** Capture result, caveats, decision, rollback trigger, and follow-up measurement. + +## Synthesized Default + +Use predeclared hypotheses, stable assignment, exposure-based analysis, primary and guardrail metrics, validity checks, segment readouts, and decision records. Treat metric trust failures as experiment blockers, not as minor caveats after the decision. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Very low-risk copy or layout tests may use simpler analysis if assignment, exposure, and guardrails remain clear. +- Sequential ramps can make decisions before full power when safety or user impact requires it, but must state the weaker inference. +- Long-term effects may need holdouts or delayed readouts before irreversible changes. + +## Response Quality Bar + +- Lead with the experiment design, validity finding, ramp decision, or metric guardrail requested. 
+- Cover hypothesis, assignment, exposure, metrics, guardrails, validity checks, slices, interactions, and decision rule before optional statistics detail. +- Make recommendations actionable with metric definitions, stop criteria, invalidation triggers, and readout dates where relevant. +- Name the details to inspect, such as assignment logs, exposure events, metric definitions, balance checks, missingness, segment results, and prior experiment interactions; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside experimentation and metric trust. Use rollout safety, service SLO, or AI eval skills only when those surfaces drive the decision. +- Be concise: prefer experiment design and readout tables over generic testing background. + +## Required Outputs + +- Experiment design with hypothesis, population, assignment unit, treatment, control, and exposure rule. +- Metric map: primary, guardrail, diagnostic, and segment metrics. +- Validity checks for assignment, sample ratio, telemetry, eligibility, contamination, and missingness. +- Ramp, stop, and readout decision rules. +- Interaction and holdout notes. +- Decision record with caveats and follow-up measurement. + +## Checks Before Moving On + +- `hypothesis_named`: experiment maps to a clear decision and expected effect. +- `assignment_valid`: unit, eligibility, and balance checks are defined. +- `exposure_logged`: exposure event records who could be affected. +- `guardrails_set`: safety and quality metrics can block a positive primary result. +- `validity_checked`: metric trust failures are checked before readout. + +## Red Flags - Stop And Rework + +- Assignment exists but exposure is not logged. +- Metrics are chosen after the result is known. +- Sample-ratio mismatch is ignored. 
+- A positive primary metric hides reliability, safety, or accessibility harm. +- The ramp continues after validity checks fail. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Rollout health as causal answer | Use assignment, exposure, and readout rules. | +| Result-first metrics | Predeclare metrics and guardrails. | +| Ignoring invalidation | Treat balance and telemetry failures as blockers. | +| Average-only readouts | Check important slices and long-term effects. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/feature-flag-lifecycle.md b/plugins/sirmarkz/staff-engineer-mode/specialists/feature-flag-lifecycle.md new file mode 100644 index 00000000..b487ff72 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/feature-flag-lifecycle.md @@ -0,0 +1,147 @@ +--- +name: feature-flag-lifecycle +description: "Use when feature flags need lifecycle decisions: expiry, orphan detection, debt scoring, cleanup, or removal" +--- + +# Feature Flag Lifecycle + +## Iron Law + +``` +EVERY LIVE FLAG HAS AN EXPIRY, SAFE FALLBACK, AND REMOVAL PLAN +``` + +A flag without all three is orphan debt. Orphan flags become dead branches, contradictory defaults, and stale kill switches that nobody dares pull during an incident. + +## Overview + +Produces a flag inventory with category and expiry per flag, an orphan report for flags whose features no longer exist, and a removal plan with rollback for each retiring flag. Refuses to count a feature as shipped while a flag still controls it. + +**Core principle:** every live flag is unfinished work. After a rollout completes, the flag, its branches, and its config rows are decision debt that compounds until someone explicitly removes them. + +## When To Use + +- The user is deciding how a feature flag should be created, categorized, expired, cleaned up, inventoried, retired, or sunset. +- The user asks to inventory existing flags, assess flag debt, or set removal checks. 
+- A rollout has completed and the flag that gated it is still live. +- An incident exposed a flag whose intended behavior has no current fallback or removal rule. +- The user asks how to stop accumulating flag debt or how to set expiry policy per flag class. +- The agent is being asked to add a new flag and the existing flag inventory and removal pattern need to be checked first. +- A code search reveals branches gated by flags that were not declared in any registry or are not referenced from production config. + +## When Not To Use + +- A change is mid-rollout and the question is staging, exposure rings, canary metrics, stop criteria, or rollback; use `progressive-delivery`. +- A flag itself is being changed as a configuration value with safety implications; use `configuration-and-automation-safety`. +- Generic dead-code or dependency cleanup with no flag-specific gating; use `dependency-and-code-hygiene`. +- The flag is an A/B experiment treatment under active analysis; use `experimentation-and-metric-guardrails`. +- The change is an org-level rule for AI-assisted code that adds flags it never removes; use `ai-coding-governance`. +- The work is broad release readiness across multiple surfaces; use `production-readiness-review`. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Flag inventory source: code search, flag-service registry, config files, environment overrides, and any per-tenant or per-location overrides. +- Per-flag metadata: name, declaration site, default value, current production value per environment, last evaluation timestamp where available, and number of branches behind the flag. +- Stated category for each flag: release toggle, experiment, operational kill switch, or permission/entitlement. +- Responsibility path per flag, fallback path, and user decision point for removal. +- Expiry policy by category and whether the flag has exceeded it.
+- Rollout state: was the flag's launch completed, partially shipped, abandoned, or still ramping. +- Failure behavior: local fallback/default value used if flag evaluation fails, the behavior selected during a flag-service outage, and whether that behavior is safe for production. +- Branch coverage: which code paths execute under each value, whether both branches still have callers, and whether any tests exercise both branches. +- Tenants, locations, cohorts, or accounts pinned to non-default values and the reason for each pin. +- Incident history involving the flag, including any time the kill-switch path was exercised. + +## Workflow + +1. **Build the inventory.** Reconcile flags discovered in code, in the flag service or config registry, and in environment overrides. A flag that exists in only one of those sources is the first orphan signal. +2. **Classify each flag.** Assign exactly one category: release toggle (turns a shipped feature on), experiment (assigns variants for measurement), operational kill switch (disables a path under load or failure), permission or entitlement (controls access by tenant, plan, or role). A flag that resists classification is itself a finding. +3. **Set expiry by category.** Release toggles default to short expiry tied to rollout completion. Experiment flags default to short expiry tied to readout date. Operational kill switches default to longer expiry but require rehearsal cadence. Permission flags may be long-lived but still need renewal decisions and a safe fallback. +4. **Check default-value safety.** Record the local default/fallback value for each flag and the behavior chosen if flag evaluation or the flag service is unavailable. The fallback should select the safest known production behavior, not an accidental SDK or config default. +5. 
**Check rollout completion.** For each release toggle, confirm the rollout finished, the chosen value is the production default everywhere, and no environment still pins the legacy value without a documented reason. +6. **Detect orphans.** Flag the following as orphans: declared in code but absent from the registry; present in registry but unreferenced in code; expiry exceeded with no removal action; both branches identical or one branch unreachable; not evaluated in production within a defined freshness window where evaluation telemetry exists. +7. **Map flag-driven branches.** For each retiring flag, list the call sites, the branch each value selects, the tests that exercise each branch, and any config rows or per-tenant overrides that depend on the flag name. +8. **Plan removal.** For each flag scheduled for removal, define: target value (the branch that stays), the order of cleanup (default flip, override sweep, code removal, registry removal, config-row removal), the rollback path if removal regresses behavior, and the verification step that shows no caller still selects the removed branch. +9. **Stage the removal as a change.** Treat flag removal as a production change with separate blast radius and rollback. Use `progressive-delivery` as the internal lens when removal touches a tier-critical path. +10. **Score the flag debt.** Produce a scorecard: total flags by category, percent past expiry, percent without a recorded responsibility path, orphan count, oldest live flag age, and removal velocity over the last review period. +11. **Set the standing rule.** Establish per-category expiry defaults, a recurring renewal cadence, and the rule that adding a new flag requires declaring its category, expiry, and safe fallback value at creation time. + +## Synthesized Default + +Treat flags as time-bounded. Release toggles expire when the rollout completes. Experiment flags expire when the readout is accepted.
Operational kill switches and permission flags may live longer but still require recurring renewal decisions. Removal is a planned change, not a cleanup ticket. The inventory is the source of truth and is reconciled against code on a defined cadence. Every flag must also document its fallback/default value and what production behavior occurs if flag evaluation fails. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Long-lived operational kill switches may exceed standard expiry if the disabled path is rehearsed on a recorded cadence. +- Permission or entitlement flags tied to billing, plan, or regulatory access may be effectively permanent; they are not orphans but still need renewal decisions, fallback behavior, and test results. +- A flag protecting an in-progress migration may stay past its initial expiry with a renewed expiry date and completion condition. +- Emergency kill switches added during an incident may bypass the create-time expiry rule but must be classified, dated, and assigned a safe fallback value within the postmortem follow-up. 
+ +## Response Quality Bar + +- Lead with the flag inventory, orphan list, removal plan, or flag-debt scorecard requested. +- Cover classification, responsibility, expiry, default-value safety, branch mapping, removal sequencing, and rollback before optional flag-system breadth. +- Make recommendations actionable with per-flag expiry, target value, fallback/default value, outage behavior, removal step, rollback step, and verification results. +- Name the details to inspect, such as code-search results, flag-registry export, environment overrides, evaluation telemetry where available, and incident history; do not state flag state from prose alone. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside post-rollout flag lifecycle. Route in-flight rollout sequencing, generic dead-code cleanup, experiment analysis, and config-change safety to the responsible specialist. +- Be concise: prefer compact inventory and removal tables over running narrative about flag philosophy. + +## Required Outputs + +- Flag inventory with name, category, declaration site, expiry, current production value per environment, fallback/default value if evaluation fails, outage behavior, and branch count. +- Orphan report listing flags with missing classification, exceeded expiry, unsafe or undocumented fallback, identical branches, unreachable branch, registry/code mismatch, or stale evaluation. +- Per-flag removal plan with target value, cleanup order, rollback path, and verification step for each flag scheduled for removal. +- Per-tenant, per-location, or per-cohort override list with reason and removal condition for each non-default pin. +- Branch map per retiring flag covering call sites, tests per branch, and dependent config rows. 
+- Flag-debt scorecard with totals by category, percent past expiry, percent without a recorded responsibility path, orphan count, oldest live flag age, and removal velocity. +- Standing rule: per-category expiry defaults, renewal cadence, and the create-time expiry/category/safe-fallback rule. +- Follow-up routes to progressive delivery, configuration safety, dependency hygiene, or experimentation as needed. + +## Checks Before Moving On + +- `flag_inventory_present`: a single inventory reconciles flags found in code, in the registry, and in environment overrides; mismatches are listed. +- `category_assigned`: every live flag has exactly one category from release, experiment, operational kill switch, or permission. +- `expiry_and_fallback`: every live flag has a dated expiry and safe fallback; any exception is recorded with renewed date and reason. +- `default_value_safety`: every live flag records the fallback/default value used when evaluation fails and the production behavior during a flag-service outage. +- `orphan_report`: orphan criteria are evaluated and the resulting flags are listed with the matching criterion per flag. +- `removal_plan_per_retiring_flag`: each flag scheduled for removal has target value, cleanup order, rollback path, and verification step. +- `branch_map`: retiring flags have a call-site list and a per-branch test list; unreachable or untested branches are flagged. +- `debt_scorecard`: scorecard covers totals by category, percent past expiry, percent without a recorded responsibility path, orphan count, oldest live flag age, and removal velocity. + +## Red Flags - Stop And Rework + +- A flag has no recorded expiry and no safe fallback, and you treat this as normal. +- A flag has no documented fallback/default value, so a flag-service outage could silently choose the wrong behavior. +- The rollout that created a flag completed months ago but the legacy branch still has callers and the flag is still evaluated in production.
+- The flag registry and the code disagree about which flags exist, and reconciliation has no response path. +- An operational kill switch has never been exercised and no rehearsal exists, so its real behavior is unknown. +- Both branches of a flag are identical or one branch is unreachable, and the flag is still evaluated. +- A flag is removed by deleting code without sweeping per-tenant overrides, registry rows, or environment pins. +- New flags are being added by AI coding agents without recording category, expiry, or safe fallback at creation. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating "the rollout finished" as cleanup | Removal is a separate planned change with rollback and verification. | +| One global flag bucket | Classify by release, experiment, operational, or permission; each has a different lifecycle. | +| Responsibility is vague | Record the user decision point and the exact removal trigger. | +| Counting flags only in code | Reconcile code, registry, and environment overrides; mismatches are orphans. | +| Ignoring flag-evaluation failure | Record the fallback/default value and confirm outage behavior is safe. | +| Removing the code path but leaving the flag | Sweep registry rows, overrides, and dependent config in the same change. | +| Letting kill switches drift untested | Rehearse the disabled path or downgrade the switch to documented inert. | +| Adding new flags faster than removing them | Track removal velocity in the scorecard and require declared expiry before new-flag creation. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/fleet-upgrades.md b/plugins/sirmarkz/staff-engineer-mode/specialists/fleet-upgrades.md new file mode 100644 index 00000000..3ade5375 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/fleet-upgrades.md @@ -0,0 +1,135 @@ +--- +name: fleet-upgrades +description: "Use when runtime, platform, framework, client, service, or host upgrades span many versions or rollout waves" +--- + +# Fleet Upgrades And Version Skew Management + +## Iron Law + +``` +NO FLEET UPGRADE WITHOUT INVENTORY, SUPPORT WINDOW, SKEW POLICY, COMPATIBILITY TESTS, AND ROLLOUT CHECKS +``` + +If you cannot see what versions exist and what combinations are supported, the upgrade plan is guessing. + +## Overview + +Fleet upgrades are compatibility projects spread across runtimes, control planes, clients, services, and operators. + +**Core principle:** inventory support windows, define allowed skew, test mixed-version compatibility, stage rollout, and keep rollback or roll-forward paths ready. + +## When To Use + +- The user asks about fleet upgrades, runtime upgrades, platform upgrades, support windows, version skew, end-of-support, or mixed-version rollout. +- Many services, clients, jobs, workers, agents, nodes, or control-plane components must move over time. +- Old and new versions need to coexist safely during rollout. +- An upstream, community, or internal platform support deadline creates production risk. + +## When Not To Use + +- The work is a routine library update inside one repo; use `dependency-and-code-hygiene` instead. +- The main risk is build artifact reproducibility; use `release-build-reproducibility` instead. +- The main risk is exposed API compatibility; use `api-design-and-compatibility` instead. +- The main task is broad service retirement; use `migration-and-deprecation` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. 
+- Fleet inventory: components, versions, environments, criticality, support status, and local ownership info when available. +- Baseline runtime or platform drift, defects already fixed in newer baselines but still present in older ones, and exception expiry. +- Version-skew policy, compatibility matrix, upgrade order, and blocked combinations. +- Tests for mixed versions, client/server compatibility, data compatibility, and operational tooling. +- Rollout batches, maintenance windows, traffic exposure, rollback or roll-forward path, and freeze dates. +- Known deprecated features, removed behavior, config changes, and operator runbooks. +- Support-window communication plan: affected components or consumers, deadline, required consumer action, reminder cadence, and user-confirmed follow-up path. +- Exception list, expiry, risk, and compensating controls. + +## Workflow + +1. **Inventory the fleet.** List versions, support windows, baseline drift, criticality, local ownership info, and unknowns. +2. **Define allowed skew.** State which old/new combinations are supported during rollout and for how long. +3. **Communicate support deadlines.** Tell affected consumers when old versions leave support, what action they must take, and when reminders, follow-up, or enforcement start. +4. **Find breaking changes.** Check behavior, config, interfaces, data formats, tooling, and operational assumptions. +5. **Check compatibility.** Test mixed-version paths, upgrade order, downgrade or roll-forward behavior, and representative workloads. +6. **Batch rollout.** Move low-risk cohorts first, then critical paths with checks, user confirmation, and monitoring. +7. **Manage exceptions.** Track blockers with expiry, risk, compensating control, and the local details needed to close them. +8. **Update operations.** Refresh runbooks, alerts, dashboards, and local operating procedures for the new version. +9. 
**Close old paths.** Remove compatibility shims, stale versions, and exceptions after adoption is verified; keep baselines current enough that available fixes do not linger unnoticed. + +## Synthesized Default + +Use a support-window inventory, explicit version-skew policy, compatibility matrix, staged rollout, support-deadline communication plan, exception register, operational runbook update, and retirement check for old versions. Prefer proving mixed-version behavior before the first production batch. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Emergency security upgrades may compress rollout stages, but still need compatibility risk decision, consumer notice if deadlines change, user confirmation, and rollback or roll-forward decision. +- Low-risk internal tools can use lighter checks if they are not production dependencies. +- Some upgrades cannot roll back safely; require stronger preflight tests and roll-forward criteria. + +## Response Quality Bar + +- Lead with the upgrade plan, skew decision, support-window risk, or blocker list requested. 
+- Cover inventory, support status, skew policy, compatibility tests, rollout batches, rollback or roll-forward, exceptions, consumer communication, and operations updates before optional detail. +- Make recommendations actionable with dates, checks, batch order, test results, consumer action requirements, and exception expiry where relevant. +- For end-of-support or support-window risk, include a short consumer communication timeline: announcement, deadline, required action from affected components or consumers, reminder cadence, and user-confirmed follow-up date. +- Name the details to inspect, such as version inventory, support deadlines, compatibility matrix, test output, rollout status, communication status, and runbook changes; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside fleet upgrade and version-skew management. Route dependency hygiene, API compatibility, or deprecation work only when that surface dominates. +- Be concise: prefer upgrade matrices and batch plans over broad migration prose. +- Emit upgrade order as a discrete labeled tier list before the time-phased rollout, not buried inside a weekly schedule. + +## Required Outputs + +- Fleet inventory with version, criticality, support status, and local ownership info when available. +- Baseline drift table showing current baseline, target baseline, available fixes not yet adopted, and exception expiry. +- Version-skew and compatibility matrix. +- Upgrade order as an explicit tier list (e.g., control plane → data plane / nodes → clients/operators), with one-line rationale per tier and the allowed skew range between tiers stated as a numeric window with breakage criteria. 
+- End-of-support / support-window communication plan with announcement date, final support date, affected consumers, required consumer action, reminder cadence, and follow-up or enforcement path. +- Rollout batches (waves) with progression criteria per wave. +- Mixed-version test plan and records requirements. +- Rollback or roll-forward plan stating both the procedure and the state-compatibility note (which prior state is restorable, which is not). +- Exception register with expiry, compensating control, and closure note. +- Operations update checklist. +- Old-version retirement check. + +## Checks Before Moving On + +- `inventory_complete`: supported, unsupported, unknown, and critical versions are visible. +- `baseline_freshness`: baseline drift, available fixes not yet adopted, and exception expiry are visible for maintained components. +- `skew_policy`: allowed mixed-version combinations and duration are explicit. +- `support_comms`: affected consumers, support deadline, required action, reminder cadence, and follow-up date are visible. +- `compatibility_test`: representative old/new paths are tested before broad rollout. +- `rollout_responsibility`: every batch has user confirmation, check, and halt criteria. +- `exception_expiry`: blocked components have risk, compensating control, expiry, and closure note. + +## Red Flags - Stop And Rework + +- The fleet inventory is based on guesses or stale spreadsheets. +- Old and new versions are assumed compatible without tests. +- Upgrade order ignores clients, jobs, agents, or operational tooling. +- Unsupported versions have no exception expiry, compensating control, or consumer communication deadline. +- Rollback is impossible but roll-forward criteria are not defined. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Version bump as task | Treat it as a compatibility and rollout project. | +| No skew policy | Define supported old/new combinations. 
| +| Silent support deadline | Announce the deadline, required consumer action, reminders, and enforcement path. | +| Ignoring operators | Update runbooks, alerts, and tooling. | +| Leaving old versions | Add retirement checks and cleanup. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/high-availability-design.md b/plugins/sirmarkz/staff-engineer-mode/specialists/high-availability-design.md new file mode 100644 index 00000000..643499b6 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/high-availability-design.md @@ -0,0 +1,134 @@ +--- +name: high-availability-design +description: "Use when a system must survive loss of a location, deployment unit, partition, shard, tenant, or dependency" +--- + +# High Availability Design And Validation + +## Iron Law + +``` +NO HA DESIGN WITHOUT A FAULT DOMAIN, SURVIVABILITY TARGET, CAPACITY MODEL, AND TEST PLAN +``` + +"Multi-location", "multi-fault-domain", and "redundant" are labels. They are not enough by themselves. + +## Overview + +High availability is the ability to keep serving through expected failures without inventing new operations during the failure. Here, location means a fault-isolated placement boundary such as a site, facility, deployment footprint, or independently operated environment. A fault domain can fail independently; a deployment unit is an operational or rollout boundary; a partition is a data, tenant, or workload slice; a stamp is a repeatable isolated service footprint. + +**Core principle:** identify fault domains, bound blast radius, provision enough steady-state capacity, and validate the failure mode before relying on it. + +## When To Use + +- The user asks whether a system can survive location, deployment-unit, process, host, shard, tenant, or dependency loss. +- A design says it is active-active, active-passive, partitioned, shuffle-sharded, or multi-location. +- A launch or PRR needs HA details. 
+- The work changes topology, failover, load balancing, placement, or blast radius. + +## When Not To Use + +- The main question is per-call retries, timeouts, backpressure, or circuit breaking; use `dependency-resilience` instead. +- The main question is restoring corrupted or lost data; use `backup-and-recovery` instead. +- The main question is planning a chaos experiment, game day, or fault injection drill; use `resilience-experiments` instead. +- The work is only unit, integration, or CI testing. +- The request is about generic uptime targets; define SLOs first via `slo-and-error-budgets`. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Service tier, SLOs, critical user journeys, and maximum tolerable interruption. +- Current topology: hosts, deployment units, locations, partitions, shards, queues, load balancers, stores, and control planes. +- Fault domains: process, node, rack, location, deployment unit, administrative boundary, cluster, deployment ring, tenant, data partition, dependency, and operator action. +- Capacity by domain, peak traffic, failover headroom, and dependency quotas. +- Fault-domain independence, per-domain telemetry, data replication model, consistency needs, and any hidden global dependencies. +- Per-domain health signals, traffic shift path, and last validation result for moving traffic away from an unhealthy domain. +- Existing failover tests, incidents, game days, chaos experiments, and rollback procedures. + +## Workflow + +1. **State the survival target.** Use the form: "survive loss of X while continuing Y, with no manual Z, within SLO W." +2. **Draw the fault-domain map.** Include serving path, data path, control plane, deployment system, identity, config, DNS, observability, and operator access. +3. 
**Check fault-domain independence.** A serving path scoped to one location, partition, or deployment unit should not require synchronous calls to another independent fault domain or shared global state unless the exception, failure behavior, and customer impact are explicit. +4. **Check static stability and constant-work behavior.** Confirm remaining domains already have enough capacity and quotas during the failure; do not count emergency scaling that depends on the failed domain. Prefer designs where the system does the same work in failure as in success — pre-provisioned headroom over reactive scaling, hedged parallel requests over retry-on-timeout, scheduled credential pushes over fetch-on-demand, heartbeat-based health over dedicated failure probes. Failure-only code paths get little real exercise and are the most common source of latent failure-mode bugs. +5. **Choose topology deliberately.** Decide whether a single-location, location-redundant, multi-location, active-passive, active-active, stamp, or partition model is justified by the survival target. +6. **Bound blast radius.** Use partitions, stamps, shards, shuffle sharding, tenant isolation, or location boundaries when one failure could otherwise affect the whole fleet. Operational actions should not affect multiple independent fault domains at once unless the user explicitly accepts the emergency risk. +7. **Remove hidden coupling.** Find global locks, shared queues, shared caches, control-plane calls, cross-location synchronous writes, centrally coupled config, and externally hosted artifacts in the serving, deploy, scale, and startup paths. +8. **Define failover behavior.** Specify automatic/manual trigger, traffic drain or shift, data consistency, split-brain prevention, client behavior, rollback to normal, and when the shift path was last validated. +9. 
**Validate safely.** Define the validation objective, then route detailed fault-injection or game-day planning to `resilience-experiments` when that is the main work. + +## Synthesized Default + +Use fault-domain independence, static stability, and explicit fault-domain isolation as the default. Prefer designs that continue in steady state after a domain loss over designs that require emergency scaling, global control-plane calls, or complex operator choreography. Add partitions, stamps, or shuffle sharding when tenant, shard, or workload blast radius is the real risk. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Active-active multi-location is justified only when serving requirements exceed the complexity cost and data semantics can tolerate the replication model. +- Active-passive, warm standby, or pilot light may be better when RTO/RPO and operational maturity are the true constraints. +- Some internal or low-tier services can document a lower survival target if the SLO and user-confirmed risk decision accept it. +- Chaos experiments must be scoped down or simulated when blast radius cannot be ethically bounded.
+ +## Response Quality Bar + +- Lead with the availability decision, survivability target, fault-domain gap, or validation plan requested. +- Cover serving paths, fault domains, static capacity, blast radius, hidden dependencies, failover behavior, data semantics, and validation before optional HA breadth. +- Make recommendations actionable with survival targets, capacity calculations, trigger/authority rules, abort criteria, and validation results where relevant. +- Name the details to inspect, such as topology, traffic split, quotas, shared dependencies, failover drills, capacity under loss, replication behavior, and SLO/RTO/RPO targets; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside HA design and validation. Route backup/restore, chaos execution, or distributed consistency only when they are central to the decision. +- Be concise: avoid generic active-active discussion and prefer compact fault-domain maps and survivability tables. + +## Required Outputs + +- Fault-domain inventory and serving-path map. +- Survivability statement using "survive loss of X while continuing Y". +- Capacity and quota model under normal, peak, and failed-domain conditions. +- Fault-domain independence and hidden-coupling exception list. +- Blast-radius analysis and partition/shard/tenant isolation recommendation. +- Hidden dependency and control-plane risk list. +- Failover decision record with trigger, authority, data behavior, and rollback. +- Per-fault-domain health and traffic-shift table with signal, action path, expected capacity, and last validation result. +- Validation plan with scope, abort criteria, telemetry, and details to capture. + +## Checks Before Moving On + +- `fault_domain_map`: expected failure domains and hidden shared dependencies are enumerated. 
+- `location_independence`: serving, startup, deploy, scale, and recovery paths avoid synchronous cross-location or globally coupled dependencies, or document the exception and fallback. +- `static_capacity`: remaining domains can serve target traffic after the named failure without emergency scaling. +- `blast_radius_bound`: a single fault cannot exceed the documented partition, tenant, shard, or location impact boundary. +- `failover_behavior`: trigger, authority, data consistency, traffic behavior, and rollback are written down. +- `per_domain_signals`: each critical fault domain has health signals that can identify local impairment. +- `shift_path`: moving traffic away from an unhealthy domain has an automatic or manual path and a last validation result. +- `validation_plan`: failover, game day, or chaos test has scope, abort criteria, telemetry, and check path. + +## Red Flags - Stop And Rework + +- "We run in two deployment units" is treated as enough to show fault-domain resilience. +- Failover depends on humans discovering the issue and manually changing many systems under pressure. +- Remaining capacity after failure is assumed but not calculated. +- Critical serving calls depend synchronously on a global control plane, config service, or cross-location dependency. +- A deploy, scale-up, startup, or recovery path depends on artifacts or control planes unavailable during the named fault. +- One operational action can damage multiple locations, deployment units, partitions, or shards at once. +- Chaos testing is proposed without blast-radius limits or abort criteria. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Confusing HA with DR | HA keeps serving through expected faults; DR restores after loss or corruption. | +| Counting autoscaling as static capacity | Model capacity already available when the domain fails. | +| Testing only the happy failover path | Test detection, partial failure, rollback, and return-to-normal. 
| +| Ignoring operator dependencies | Include identity, access, dashboards, deploy, and config systems in the map. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/identity-and-secrets.md b/plugins/sirmarkz/staff-engineer-mode/specialists/identity-and-secrets.md new file mode 100644 index 00000000..7c06eb21 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/identity-and-secrets.md @@ -0,0 +1,128 @@ +--- +name: identity-and-secrets +description: "Use when designing human or workload access, scopes, credential lifetime, secret storage, or break-glass paths" +--- + +# Zero Trust Identity And Secrets + +## Iron Law + +``` +NO ACCESS PATH WITHOUT IDENTITY, AUTHORIZATION, CREDENTIAL LIFETIME, AUDIT, AND REVOCATION +``` + +If credentials cannot be scoped, rotated, audited, or revoked, they should not protect production access. + +## Overview + +Identity is the control plane for human and workload power. + +**Core principle:** authenticate explicitly, authorize least privilege, prefer short-lived credentials, isolate secrets, and audit high-risk actions. + +## When To Use + +- The user asks about identity, zero trust, service accounts, workload identity, federation, multi-factor access, secrets, keys, encryption, cryptography, or access control. +- A system grants human, service, admin, break-glass, tenant, or cross-environment access. +- Secrets appear in code, logs, CI, images, config, tickets, or operational workflows. +- A design needs key management, credential rotation, audit events, or least-privilege decisions. + +## When Not To Use + +- The request is general app threat modeling without identity/secrets focus; use `secure-sdlc-and-threat-modeling` instead. +- The main issue is artifact signing or build provenance; use `software-supply-chain-security` instead. +- The main issue is tenant data isolation; use tenant isolation. +- The request is staffing or access-program work without engineering implementation; out of scope. 
+ +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Human identities, workload identities, roles, privileges, environments, tenants, and admin paths. +- Authentication factors, federation, session lifetime, device/context signals, and step-up requirements. +- Authorization model, permission granularity, default grants, just-in-time elevation, and break-glass process. +- Secrets, tokens, keys, certificates, storage locations, rotation cadence, expiry, and consumers. +- Service-call authentication coverage, credential expiry signals, rotation lead time, and response path when expiry approaches. +- Encryption needs, key responsibility, key separation, data classification, and long-lived confidentiality requirements. +- Activity logs, log retention, alerting, access recertification cadence, and revocation path. + +## Workflow + +1. **Inventory access paths.** Include humans, services, jobs, automation, support tools, emergency access, and third parties. +2. **Replace network trust.** Base access on identity, context, resource sensitivity, and explicit authorization, not location alone. +3. **Minimize privileges.** Scope permissions by action, resource, tenant, environment, and duration. +4. **Prefer workload identity and short-lived credentials.** Use managed identity where the runtime and resource share a trust domain; use workload identity federation when crossing platforms or organizations; use expiring tokens, rotation, revocation, and expiry signals over static long-lived secrets. +5. **Protect secrets and keys.** Keep them out of source, logs, images, and broad config; separate key administration from data access where risk warrants it. +6. **Design break-glass deliberately.** Require strong authentication, limited duration, justification, audit, and post-use verification. +7. 
**Use vetted cryptography.** Prefer standard protocols and managed primitives; do not invent algorithms or key-handling schemes. +8. **Audit and verify.** Emit access, privilege change, secret access, key use, and admin events with a user-visible verification path. + +## Synthesized Default + +Use zero-trust access with explicit identity, least privilege, workload identity for software, federation instead of copied secrets across trust boundaries, short-lived credentials, secure secret storage, strong human authentication, traceable break-glass, and vetted cryptography. Treat access as continuously verifiable, not granted once forever. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Legacy systems may need compensating controls while static credentials are removed; require expiry and rotation. +- Offline or embedded environments may require longer-lived secrets, but scope and revocation must be explicit. +- Very low-risk internal tools can use simpler access models if production data and privileged operations are absent. +- Long-lived confidentiality may require crypto-agility and post-quantum planning.
+ +## Response Quality Bar + +- Lead with the access model, secret-risk decision, migration plan, or blocker list requested. +- Cover identity boundaries, least privilege, credential lifetime, break-glass, audit, and cryptography before optional security breadth. +- Make recommendations actionable with permission scopes, rotation steps, audit events, stop criteria, and migration checks where relevant. +- Name the details to inspect, such as access inventories, service identities, secret locations, key rotation history, audit logs, and break-glass records; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside identity, secrets, and cryptography. Route privacy, supply-chain, or tenant isolation only when those are the central unresolved risk. +- Be concise: avoid generic zero-trust background and prefer compact access matrices, secret inventories, and migration checklists. + +## Required Outputs + +- Identity and access model. +- Human/service permission table with least-privilege decisions. +- Secret and key inventory with storage, rotation, expiry, and consumers. +- Authentication coverage and credential-expiry table with lead time, signal, owner path, and response. +- Break-glass and just-in-time access process. +- Audit event and access-recertification requirements. +- Cryptography decision record. +- Migration plan for overbroad or long-lived credentials. + +## Checks Before Moving On + +- `access_inventory`: human, workload, admin, emergency, and third-party access paths are listed. +- `least_privilege`: permissions are scoped by action/resource/environment/tenant and default-deny is addressed. +- `credential_lifetime`: secrets and tokens have storage, expiry, rotation, and revocation plan. 
+- `expiry_signal`: credentials, certificates, and tokens that can break production have expiry visibility and response lead time. +- `auth_coverage`: service calls have authentication coverage or an explicit exception with compensating controls. +- `activity_log_check`: high-risk access and privilege changes emit linked activity logs. +- `crypto_check`: cryptographic choices use vetted primitives and key responsibility is defined. + +## Red Flags - Stop And Rework + +- Production access depends on network location alone. +- Long-lived shared secrets have no rotation plan or check path. +- Break-glass access is permanent or unaudited. +- Secrets can appear in logs, build output, images, or client-visible config. +- Custom cryptography is proposed. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating authentication as authorization | Authenticate identity, then authorize specific actions. | +| Sharing service accounts | Use workload identity or distinct scoped credentials. | +| Rotating without revocation | Define detection, revocation, and consumer restart/reload behavior. | +| Logging everything | Audit access without leaking secrets or sensitive data. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/incident-response-and-postmortems.md b/plugins/sirmarkz/staff-engineer-mode/specialists/incident-response-and-postmortems.md new file mode 100644 index 00000000..3023164f --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/incident-response-and-postmortems.md @@ -0,0 +1,140 @@ +--- +name: incident-response-and-postmortems +description: "Use when running active incidents, writing postmortems, or setting status cadence and action items" +--- + +# Incident Response And Postmortems + +## Iron Law + +``` +NO INCIDENT WITHOUT ROLES, IMPACT, AND STATUS CADENCE; NO POSTMORTEM WITHOUT TIMELINE, CONTRIBUTING FACTORS, AND VERIFIED ACTIONS +``` + +The two halves are co-designed: live response is unsafe without named responders, declared impact, and a predictable next-update time; a postmortem that only names a root cause or a person has not explained the system. For a solo developer the responder roles collapse onto one person, but the role labels still have to be explicit so nothing falls between them. + +## Overview + +Produces incident roles and severity, a live timeline, a status-update cadence, a checkpoint packet for shift changes, and a blameless postmortem whose action items have due dates and observable verification signals. Refuses "human error" as a conclusion and refuses action items that read "be more careful". + +**Core principle:** coordinate clear roles, mitigate impact, preserve a timeline, communicate predictably, and convert learning into verified engineering improvements. + +## When To Use + +- The user asks for outage handling, incident command, severity, status updates, response roles, timelines, postmortems, or action items. +- A customer-impacting degradation, data issue, security event, or operational emergency is active or recently resolved. +- You need a blameless postmortem or follow-up tracker.
+- An incident exposed gaps in alerts, runbooks, responsibility, deployment safety, or architecture standards. + +## When Not To Use + +- The work is pre-launch readiness with no incident; use `production-readiness-review` instead. +- The request is brand, PR, legal strategy, or customer-support policy beyond operational status communication. +- The user asks only to define telemetry; use `observability-and-alerting` instead. +- The user asks only to reduce alert fatigue; use `oncall-health` instead. + +## Info To Gather + +- Impact: affected users, journeys, severity, start/end times, data loss/corruption, and business-critical periods. +- Current state: active, mitigated, resolved, monitoring, or postmortem-only. +- Responders, roles, fallback path, user decision point, and communication channels. +- Available docs, dependency status, and user-provided contacts that can inform mitigation without blocking on an outside party. +- Timeline events: detection, triage, mitigation, customer communication, resolution, and recurrence. +- Mitigations attempted, signals observed, dashboards/logs/traces used, and changes during the window. +- Impact scoping by affected users or tenants, fault domain, dependency, and recent change markers. +- Contributing factors, missed signals, runbook gaps, responsibility gaps, and action-item candidates. + +## Workflow + +1. **During active impact, assign roles.** Use incident commander, operations lead, communications lead, and scribe when coordination requires them; for solo work, explicitly take each role yourself. +2. **Classify ticket severity.** Use impact radius and urgency: highest severity for widespread critical user or data/security impact, high severity for major but bounded customer impact, medium severity for limited degradation or internal dependency risk, and low severity for a low-impact anomaly requiring follow-up. +3. 
**Put live-site impact first.** Treat customer-visible availability, health, and security as the top priority until impact is controlled. +4. **Bound impact scope early.** Use user, tenant, fault-domain, dependency, and recent-change signals to bound impact safely. +5. **Mitigate before explaining.** Prefer actions that reduce user impact safely; postpone deep root-cause analysis until impact is controlled. +6. **Keep a live timeline.** Record timestamped facts, hypotheses, decisions, commands/actions, status updates, and responsibility changes. +7. **Communicate predictably.** Set status cadence by ticket severity; highest-severity incidents should update within 30 minutes or less, high-severity incidents within an hour, and lower severities by the user-confirmed cadence. Say what is known, unknown, impact, mitigation, and next update time. +8. **Change strategy when stuck.** Use the user, available documentation, dependency status, or a narrower diagnostic skill when impact persists, mitigation authority is unclear, or a latent risk is not getting traction. Do not wait for a vendor or outside group before taking the safest available mitigation. +9. **Checkpoint explicitly.** At every incident-commander or shift change, record state, current hypothesis, customer impact, in-flight actions, user decision point, comms cadence, and next decision point. +10. **Use the normal hotfix path where possible.** Reduce context switching by keeping artifact, branch, change, and rollout mechanics traceable even under urgency. +11. **Run security incidents as a protected track.** When confidentiality, integrity, identity, abuse, or data exposure may be involved, preserve logs and artifacts, restrict sensitive details to need-to-know responders, and keep operational facts separate from legal conclusions. +12. **Stabilize and verify.** Confirm recovery with user-visible metrics, not only internal health. +13. 
**Write a blameless postmortem.** Explain contributing factors across technical, operational, detection, change, and organizational layers. +14. **Replace single-root-cause wording with layered factors.** If the user supplies "root cause: X", treat X as one technical trigger, then add control, detection, rollout, responsibility, or organizational defenses that allowed impact; mark inferred factors as candidates to verify. +15. **Create verified actions.** Every action needs due date, observable completion signal, and classification: prevent, detect, mitigate, or learn. +16. **Feed standards.** Turn recurring classes into SLO, observability, safe-change, HA, dependency-resilience, or platform-improvement work. + +## Synthesized Default + +Use role-based incident command during response and blameless, contributing-factor postmortems after recovery. Prefer mitigation and clear communication over premature diagnosis. Treat security incidents as record-sensitive operational events, keep engineering accountable for live-site outcomes, and treat action items as engineering commitments with verification, not aspirations. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. 
+ +## Exceptions + +- Security, privacy, legal, or safety incidents may have confidentiality constraints; keep operating from verified facts and user-provided requirements. +- Very small internal incidents can use a lightweight postmortem if impact, timeline, and action tracking remain explicit. +- If an incident is ongoing, delay final postmortem conclusions and keep outputs focused on response. +- Customer-facing wording may need user confirmation, but operational status cadence and facts remain in scope. + +## Response Quality Bar + +- Lead with the incident command plan, current mitigation posture, timeline, postmortem finding, or action register requested. +- Cover impact, severity, roles, timeline, communications cadence, mitigation, contributing factors, missed defenses, and verified actions before optional incident mechanics. +- For postmortems, include a **Contributing Factors** section with at least three factors across at least two layers such as technical trigger, detection gap, rollout/control gap, responsibility/runbook gap, or organizational tradeoff; avoid presenting one root cause as the whole explanation. +- Make recommendations actionable with user decision point, timestamps, next-update times, verification conditions, due dates, and follow-up checks where relevant. +- Name the details to inspect, such as alerts, dashboards, logs, deploy markers, chat timeline, customer-impact data, mitigation commands, and action verification; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside incident response and postmortems. Use security/privacy constraints or specialist reliability checks only when they are central to the next action. +- Be concise: avoid generic blameless-postmortem theory and prefer compact timelines, status updates, and action tables. 
+ +## Required Outputs + +- Incident role assignment and severity classification. +- Live or reconstructed timeline. +- Impact summary with detection, mitigation, and resolution times. +- Impact-scope table by user group or tenant, fault domain, dependency, and recent change marker where available. +- Communications cadence and status-update skeleton. +- User-confirmed strategy-change trigger when mitigation stalls. +- Checkpoint packet for long incidents or responder changes. +- Postmortem with layered contributing factors and missed defenses, not only a root-cause statement. +- Action-item register with due date, observable verification signal, and category. +- Follow-up engineering checks for the relevant skill surfaces. + +## Checks Before Moving On + +- `impact_check`: user impact, severity, start/end or current state, and affected journeys are stated. +- `impact_scoping`: affected users or tenants, fault domains, dependencies, and recent changes are scoped or marked unknown. +- `role_check`: response roles and user decision point are assigned or explicitly not needed. +- `timeline_check`: detection, triage, mitigation, communication, resolution, and key decisions are captured. +- `checkpoint_check`: long incidents or role changes include state, in-flight actions, comms cadence, and next decision point. +- `blameless_check`: postmortem focuses on system factors and avoids person-blame or single-root-cause simplification. +- `action_check`: every action has due date, verification condition, and category. + +## Red Flags - Stop And Rework + +- The postmortem concludes "human error" without explaining system conditions. +- Timeline is reconstructed from memory with no timestamps or source records. +- Action items say "be more careful", "monitor better", or "improve tests" without verification. +- Status updates have no next-update time. +- Responders keep investigating without changing mitigation strategy when mitigation is stalled or authority is unclear. 
+- Mitigation is delayed because responders are debating root cause during active impact. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Root-cause hunting during impact | Mitigate first, analyze after stabilization. | +| One action per symptom | Group by contributing factor and defense gap. | +| Blameless means consequence-free | Focus accountability on system improvements and verified actions. | +| Postmortem as ritual | Feed findings into standards, platform, and reliability backlog. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/infrastructure-and-policy-as-code.md b/plugins/sirmarkz/staff-engineer-mode/specialists/infrastructure-and-policy-as-code.md new file mode 100644 index 00000000..5c60b77a --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/infrastructure-and-policy-as-code.md @@ -0,0 +1,134 @@ +--- +name: infrastructure-and-policy-as-code +description: "Use when infrastructure needs declarative desired state, policy checks, drift detection, or environment promotion" +--- + +# Infrastructure Desired State And Policy As Code + +## Iron Law + +``` +NO INFRASTRUCTURE CHANGE WITHOUT VERSIONED DESIRED STATE, POLICY CHECKS, DRIFT RESPONSE, AND RECOVERY PLAN +``` + +If production infrastructure can change outside versioned desired state, policy checks, drift response, and a recovery plan, the platform is not controlled. + +## Overview + +Infrastructure is safer when desired state, policy checks, drift handling, and rollback are explicit. + +**Core principle:** make infrastructure changes declarative, enforceable, traceable, and continuously reconciled. + +## When To Use + +- The user is designing or changing infrastructure as code, declarative delivery, policy as code, deployment admission, drift detection, environment promotion, or infrastructure rollback. +- A platform needs enforceable standards for deployment, networking, identity, secrets, tagging, or runtime configuration. 
+- Manual infrastructure changes are causing drift, outages, or traceability gaps. +- The user needs to map platform policies into automated checks. + +## When Not To Use + +- The request is application business logic policy. +- The work is broad platform product design; use `platform-golden-paths` instead. +- The main topic is artifact provenance or signing; use `software-supply-chain-security` instead. +- The request is one-off architecture without reusable infrastructure policy. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Infrastructure resources, environments, responsible change path, desired-state repositories, and change workflow. +- Infrastructure boundaries for independent fault domains, shared control-plane dependencies, drift detection, and emergency reconciliation path. +- Policy requirements: security, reliability, identity, network, secrets, tagging, cost, and operational standards. +- Deployment/admission points, promotion model, user confirmations, and emergency-change path. +- Drift sources, detection methods, reconciliation authority, and incident history. +- Rollback/roll-forward mechanisms, state storage, locks, and blast-radius controls. +- Secret material, secret references, diff redaction, and state-store protection requirements. + +## Workflow + +1. **Define desired state.** Identify which infrastructure and runtime config must be represented declaratively. +2. **Keep secrets out of desired-state diffs.** Store secret references, encrypted envelopes, or external secret bindings instead of plaintext; redact plans/diffs and fail the change if secret values appear in change artifacts. +3. **Make changes traceable in version control.** Require responsible change path, plans/diffs, checks, and user confirmations appropriate to risk. +4. 
**Encode and test policies.** Convert standards into automated rules with clear failure messages, fixture tests, historical dry runs where feasible, and an exception path. +5. **Separate platform and workload boundaries.** Make shared services, application environments, fault-domain boundaries, shared control-plane dependencies, and responsibility explicit so policy inheritance and exceptions are understandable. +6. **Enforce at the right point.** Use pre-merge, pre-deploy, admission, or continuous drift checks depending on risk and feasibility. +7. **Detect drift.** Compare actual state to desired state and decide whether to alert, reconcile, or open a ticket. +8. **Plan rollback.** State when rollback is possible, when roll-forward is safer, and how state is protected. +9. **Handle emergencies.** Permit manual break-glass only with a separate emergency identity, traceability, a maximum duration, automatic re-locking, reconciliation, and a post-change check. +10. **Protect the source of truth.** Treat desired-state repositories, state stores, lock stores, and reconcilers as production control-plane dependencies with access control, backup, and recovery plans. +11. **Feed records.** Surface policy and drift records to scorecards and production readiness reviews (PRRs) where useful. + +## Synthesized Default + +Use declarative desired state, traceable changes, automated policy checks, clear platform/workload boundaries, drift detection, controlled reconciliation, and explicit emergency paths. Policies should be technology-agnostic standards expressed as enforceable rules. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. 
+- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Some low-risk experiments can use temporary manual resources if isolated and expiry is enforced. +- Emergency changes may bypass normal change flow only with traceability and reconciliation. +- Not every policy should block immediately; advisory mode helps tune signal before enforcement. +- Roll-forward may be safer than rollback for stateful infrastructure; document the decision. + +## Response Quality Bar + +- Lead with the infrastructure workflow, policy decision, drift finding, or emergency-change procedure requested. +- Cover desired-state scope, traceability, plan/diff details, policy checks, enforcement mode, drift response, rollback or roll-forward, and emergency reconciliation before optional GitOps breadth. +- Make recommendations actionable with source-of-truth paths, policy rules, exception workflow, detection cadence, reconciliation steps, and checks where relevant. +- Name the details to inspect, such as repo paths, plans/diffs, user confirmations, policy outputs, drift reports, reconciliation logs, break-glass records, and deployment status; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside infrastructure workflow and policy-as-code. Route platform product work or supply-chain controls only when they are central to the decision. 
+- Be concise: avoid generic GitOps background and prefer compact workflow and control matrices. + +## Required Outputs + +- Infrastructure change workflow. +- Desired-state scope and responsibility. +- Fault-domain infrastructure boundary map, including shared state or control-plane dependencies that can defeat intended independence. +- Policy-as-code control matrix. +- Enforcement point and exception model. +- Drift detection and reconciliation plan. +- Rollback/roll-forward and emergency-change procedure. +- Secret-reference and diff-redaction guardrails. +- Desired-state and state-store protection plan. +- Links for change, policy, drift, and deployment records. + +## Checks Before Moving On + +- `desired_state`: managed infrastructure scope and source of truth are explicit. +- `change_record`: changes are linked to a plan/diff, responsible change path, and confirmation path. +- `secret_check`: desired state and change artifacts do not expose plaintext secrets. +- `policy_check`: policies map to engineering standards and enforcement/advisory mode. +- `drift_check`: drift detection and reconciliation response are defined. +- `infra_fault_boundary`: intended independent fault domains have separate configurable boundaries or an explicit shared-dependency exception. +- `emergency_check`: manual break-glass changes require separate identity, expiry, change history, reconciliation, and re-locking. + +## Red Flags - Stop And Rework + +- Production resources are changed manually and never reconciled. +- Policies block without clear error messages or exception path. +- Desired state is split across undocumented sources. +- Secret values appear in desired state, plan output, logs, or change diffs. +- Rollback assumes state can simply be reverted. +- Emergency changes leave permanent drift. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Policy as prose | Encode enforceable or traceable checks. 
| +| Blocking too early | Tune in advisory mode, then enforce high-signal rules. | +| Ignoring drift | Define detection, reconciliation, and the change path. | +| No emergency path | Add traceable break-glass and post-change cleanup. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/internal-service-networking.md b/plugins/sirmarkz/staff-engineer-mode/specialists/internal-service-networking.md new file mode 100644 index 00000000..6e5224f1 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/internal-service-networking.md @@ -0,0 +1,133 @@ +--- +name: internal-service-networking +description: "Use when designing internal service traffic needing discovery, routing, locality, identity, or private access" +--- + +# Internal Networking And Service Mesh + +## Iron Law + +``` +NO INTERNAL SERVICE PATH WITHOUT IDENTITY, FAILURE MODE, OBSERVABILITY, AND AN OPERATIONS PLAN FOR EVERY HOP +``` + +Every hop on a service-to-service path needs a workload identity, a documented failure mode, telemetry that explains what happened, and a runnable debugging and upgrade path. "We added a mesh" or "we use DNS" is not an answer to any of those four. For a solo or two-service deployment the rule still applies at a smaller scale. + +> This skill assumes a multi-service deployment. A single-process app does not have internal service hops; route to `dependency-resilience` for remote-call policy or `architecture-decisions` if the question is whether to split. + +## Overview + +Internal networking should solve concrete traffic, identity, policy, and observability problems; mesh is not a default. + +**Core principle:** choose the simplest internal networking model that provides required routing, identity, reliability, observability, and operations guarantees. 
+ +## When To Use + +- The user is designing, changing, or troubleshooting internal service networking, service mesh, internal load balancing, service discovery, east-west traffic policy, authenticated service-to-service transport, locality-aware routing, or cross-location network cost. +- Services need consistent traffic policy, identity, telemetry, routing, or authorization at the platform layer. +- Internal routing or failover behavior affects reliability, latency, blast radius, or cost. +- The user asks whether adopting a service mesh is justified. +- The affected path is known to be internal service-to-service or private network traffic. + +## When Not To Use + +- The request is public edge abuse or denial-of-service defense; use `edge-traffic-and-ddos-defense` instead. +- The request is a vague network issue without a known affected path, surface, or symptom; use the router first. +- The issue is per-call retry/timeout/backpressure policy without networking architecture; use `dependency-resilience` instead. +- The main topic is API contract design; use `api-design-and-compatibility` instead. +- The work is broad identity/secrets beyond network identity; use `identity-and-secrets` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Service topology, traffic flows, protocols, locations, fault domains, partitions, dependencies, and responsibility. +- Concrete problem: service identity, encrypted transport, authorization, traffic splitting, locality, failover, observability, policy, or debugging. +- Current service discovery, load balancing, DNS/routing, ingress/egress, and network boundaries. +- Traffic entry points, routing or load-balancing limits, connection/concurrency limits, queue limits, overflow behavior, and emergency adjustment path. +- Latency, cross-location egress, failure domains, retry behavior, and dependency resilience policies. 
+- Platform maturity: upgrade process, sidecar/proxy/data-plane operations, incident history, and local diagnostic path. +- Telemetry needs: route, upstream/downstream identity, locality, retries, connection errors, and request context. + +## Workflow + +1. **Name the problem.** Do not propose mesh until the repeated capability gap is explicit. +2. **Map traffic.** Identify internal routes, traffic entry points, dependencies, locations, failover paths, identity boundaries, policy points, and overflow behavior. +3. **Compare no-mesh alternatives.** Consider library, gateway, platform, or simple load-balancer capabilities before adding a mesh-wide data plane. +4. **Define routing policy.** Include locality, failover, traffic splitting, retries, timeouts, and circuit behavior responsibility. +5. **Define identity and policy.** State how workload identity, authenticated encrypted transport, authorization, and audit work. +6. **Model failure and upgrades.** Include proxy/control-plane failure, config error, upgrade rollout, and debug burden. +7. **Instrument paths.** Capture request IDs, route metadata, identity, upstream locality, retries, errors, latency, connection saturation, queue pressure, and overflow decisions. +8. **Plan adoption.** Roll out by service, partition, or environment; keep rollback and exception path. + +## Synthesized Default + +Do not add service mesh by default. Adopt a mesh or equivalent platform traffic layer only when repeated cross-service needs justify its operational cost: identity, encrypted transport, traffic policy, telemetry, authorization, routing, or locality. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. 
+- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Small systems may use simple internal load balancing and library conventions. +- High-security or multi-tenant platforms may justify centralized identity and traffic policy earlier. +- Cross-location systems may prefer explicit location boundaries and locality rules over opaque global routing. +- Emergency network changes need audit, rollback, and post-change reconciliation. + +## Response Quality Bar + +- Lead with the mesh/no-mesh decision, routing policy, identity model, or failure-mode blocker requested. +- For quick design or troubleshooting answers, still include one compact per-edge baseline: `<caller> -> <callee>` discovery/routing mechanism and stale/unavailable behavior; service-to-service authentication mechanism and scope, such as mutual-authentication transport workload identity, mesh identity, or a signed service token for that edge; per-request authorization decision criteria, such as caller identity plus method/resource/action; default-deny service policy with user-confirmed exception rule; RED metrics (request rate, error rate, latency) with dashboard and alert; and runnable debug command or procedure. +- Cover concrete repeated needs, traffic map, routing/locality/failover, identity/encrypted transport/authorization, retry responsibility, telemetry, upgrades, rollback, and cost/latency tradeoffs before optional mesh breadth. 
+- Make recommendations actionable with policy locations, rollout stages, config checks, failure tests, rollback steps, and operational runbooks where relevant. +- Name the details to inspect, such as dependency maps, route config, retry/timeout settings, control-plane health, proxy versions, identity assertions, latency/egress data, and incident history; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside internal traffic and service mesh decisions. Route dependency resilience or zero-trust work only when it materially changes the mesh decision. +- Be concise: avoid generic mesh advocacy and prefer compact decision records and routing matrices. + +## Required Outputs + +- Internal traffic and dependency map. +- Mesh/no-mesh decision record with alternatives. +- Routing, locality, failover, and traffic-splitting policy. +- Traffic-path capacity table with entry point, routing limit, connection/concurrency limit, overflow behavior, and emergency adjustment path. +- Workload identity, encrypted transport, and authorization model. +- Operations, upgrade, diagnostics, and rollback plan. +- Network telemetry and debugging requirements. +- Cost and latency tradeoff notes for cross-boundary traffic. + +## Checks Before Moving On + +- `problem_check`: mesh or routing layer adoption maps to concrete repeated needs. +- `failure_model`: data-plane, control-plane, config, and upgrade failure modes are addressed. +- `diagnostic_check`: debugging, upgrade, and incident-response paths are explicit and runnable or marked unknown. +- `routing_policy`: locality, failover, traffic split, and retry/timeout responsibility are defined. +- `traffic_entry_capacity`: traffic entry points have capacity, connection/concurrency, and routing limits stated. 
+- `overflow_behavior`: overload, spillover, or reject behavior is defined and observable. +- `telemetry_check`: route, identity, locality, retry, latency, and error metadata are observable. + +## Red Flags - Stop And Rework + +- Mesh is selected because it is fashionable. +- Proxy upgrades or data-plane incidents have no runnable diagnostic or rollback path. +- Routing retries conflict with application retry budgets. +- Cross-location routing hides latency and egress cost. +- Identity is asserted but not tied to authorization or audit. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Mesh first | Start with the capability gap and simpler options. | +| Hidden retries | Align network retries with application retry budgets. | +| No upgrade plan | Treat data-plane upgrades as production releases. | +| Blind global routing | Make locality, failover, and cost explicit. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/llm-application-security.md b/plugins/sirmarkz/staff-engineer-mode/specialists/llm-application-security.md new file mode 100644 index 00000000..6fe3a336 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/llm-application-security.md @@ -0,0 +1,152 @@ +--- +name: llm-application-security +description: "Use when LLM prompts, retrieval, tools, model output, or generated actions cross security boundaries" +--- + +# LLM Application Security + +## Iron Law + +``` +NO LLM TOOL OR DATA ACCESS WITHOUT A BOUNDARY MAP, LEAST PRIVILEGE, ABUSE-CASE EVALS, AUDIT, AND OUTPUT HANDLING +``` + +If the model can cause an action, that action needs an explicit boundary, least-privilege scoping, abuse-case tests (prompt injection, exfiltration, jailbreak, unsafe action), an audit trail, and contextual handling of the output before any sink consumes it. "Evals" here are adversarial cases, not happy-path quality checks. + +## Overview + +LLM applications move untrusted text across tool, data, and decision boundaries. 
+ +**Core principle:** treat prompts, retrieved content, tool outputs, and model responses as untrusted inputs; constrain what the application can do with them. + +## When To Use + +- The user is designing or building LLM prompt, retrieval, tool, output, action, or data flows that cross security boundaries. +- The user asks about prompt injection, tool permissions, retrieval boundaries, insecure output handling, sensitive prompt/response handling, agent actions, model/prompt/retrieval supply chain, emergency stop, or LLM eval security checks. +- An LLM can retrieve private data, call tools, write files, send messages, execute actions, or influence decisions. +- The system mixes instructions, user input, retrieved content, and tool output. +- A launch needs security tests for AI workflow behavior. + +## When Not To Use + +- The request is broad AI strategy, model strategy, or ethics work outside engineering controls. +- The work is classical ML evaluation or drift; use `ml-reliability-and-evaluation` instead. +- The request is general application threat modeling without LLM-specific boundaries; use `secure-sdlc-and-threat-modeling` instead. +- The issue is generic artifact provenance with no model/prompt/tool supply chain concern; use `software-supply-chain-security` instead. +- The main work is personal-data lifecycle, retention, deletion, export, or prompt/response storage controls; use `privacy-and-data-lifecycle` instead unless LLM prompt, retrieval, tool, or output boundaries dominate. +- The main work is tenant boundary enforcement outside LLM retrieval/session context; use `tenant-isolation` instead. +- The main work is generic source, build, artifact, or model provenance with no prompt, tool, retrieval, or dataset workflow boundary; use `software-supply-chain-security` instead. 
+- The main work is rollout, rollback, staged exposure, or release sequencing for a model-backed change; use `progressive-delivery` instead unless the emergency stop is an LLM-specific control gap. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- LLM workflow, actors, prompts, system instructions, retrieved data, tools, actions, and output sinks. +- Trust boundaries among user input, developer instructions, retrieved documents, model output, tool results, and external systems. +- Data classification, tenant boundaries, permissions, secrets, and privacy constraints. +- Tool capabilities, scopes, rate limits, user confirmations, side effects, and activity logs. +- Input and output validation needs: length, file type, links, hidden/control characters, rendered content, feedback forms, and downstream consumers. +- Eval set: prompt injection, data exfiltration, unsafe actions, over-permission, output injection, and regression cases. +- Logging, redaction, prompt/response storage, retention, human access, and incident response expectations. +- Model chains, nested prompts, loop limits, retrieval fanout, token/cost ceilings, and downstream systems that consume model output. +- Model, prompt, retrieval corpus, index, dataset, and tool-policy provenance, rollback targets, and emergency disable paths. + +## Workflow + +1. **Map boundaries.** Identify every place untrusted text can influence prompts, retrieval, tool calls, code paths, messages, or stored state. +2. **Constrain tools.** Give tools minimum permissions, explicit schemas, rate limits, loop/depth limits, side-effect boundaries, and confirmation checks for high-risk actions. +3. **Protect retrieval.** Enforce tenant/data permissions before retrieval and again before answer/action use. +4. 
**Treat output as untrusted by sink.** Commands need allowlisted operations and dry-run/confirmation where risky; queries need parameterization and scoped credentials; rendered text needs contextual encoding; structured tool inputs need schema validation; documents/messages need destination policy checks; downstream prompts need boundary markers and instruction-isolation. +5. **Validate inputs and feedback.** Bound length and tokens, validate uploaded files by content and declared type, normalize or reject hidden/control characters, set an explicit link/URL policy, redact or block sensitive data by purpose, and apply the same controls to free-form feedback. +6. **Separate instructions from data.** Do not let retrieved or user content override developer/system policy. Use structural boundaries, markers, and deterministic checks as defense in depth, not as guarantees. +7. **Protect stored prompts and responses.** Classify prompts and outputs, minimize retention, restrict human access by need, encrypt with accountable key responsibility, and audit access. +8. **Protect session isolation.** Keep user sessions, conversation state, retrieved context, and mutable objects scoped per user/request; test race conditions that could leak history across users or tenants. +9. **Plan emergency stop and rollback.** Define independent disable or rollback paths for prompt templates, tool permissions, model/config, retrieval corpus, index, and training or fine-tuning inputs. +10. **Scope model chains.** When one model output feeds another model or agent, give each step separate permissions, retrieval boundary, audit trail, and injection eval. +11. **Evaluate adversarially.** Test prompt injection, tool misuse, data leakage, refusal bypass, unsafe output, dependency substitution, recursive tool loops, retrieval amplification, and regression cases with a repeatable adversarial corpus. +12. 
**Audit decisions.** Log prompts, retrieval identifiers, tool calls, user confirmations, denials, and outcomes with privacy-preserving redaction, retention, and replay-for-investigation rules. +13. **Control supply chain.** Track prompts, tools, models, datasets, retrieval corpora, indexes, and deployment artifacts as versioned inputs with version, source, eval result, integrity checks, rollback target, and retirement date. Treat executable or code-loading model artifacts as unsafe unless isolated, allowlisted, and justified. + +## Synthesized Default + +Use least-privilege tools, permission-checked retrieval, input validation, untrusted-output handling, sensitive-data controls, session isolation, adversarial eval checks, audit logs, emergency rollback, and versioned AI workflow inputs. Test the workflow against realistic attacker goals, then make deterministic application controls decide what is allowed. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Read-only summarization with public data can use lighter tool controls, but output handling and injection tests still matter. 
+- Human confirmation can mitigate high-impact actions, but the confirmation UI must show trustworthy context, not only model-manipulated summaries. +- Some logs must be minimized or redacted for privacy; keep enough traceability to investigate unsafe actions. +- Classifiers or model judges can help detect attacks, but they are defense in depth and must not be the only enforcement boundary. +- Broad AI strategy questions are out of scope unless tied to deployable engineering controls. + +## Response Quality Bar + +- Lead with the LLM threat model, tool-permission decision, eval check, or blocker list requested. +- For short design or pre-launch answers, include a compact release-check list: prompt-injection mitigation plus verification; tool inventory plus per-tool authorization; sink-specific output validation before execution, querying, rendering, messaging, or downstream prompting; sensitive-info controls plus monitoring; adversarial abuse cases with pass/fail criteria; and audit logs for model invocations, tool calls, denials, user confirmations, and retention. +- Cover prompt/retrieval/tool/output boundaries, least privilege, tenant/data isolation, input and output validation, unsafe-action controls, sensitive-data handling, adversarial evals, logging, emergency rollback, and supply-chain records before optional AI-security breadth. +- Make recommendations actionable with permission scopes, deterministic control points, eval cases, confirmation checks, stop criteria, and regression checks where relevant. +- Name the details to inspect, such as retrieval IDs, tool scopes, action sinks, prompt versions, model versions, eval results, audit logs, and redaction rules; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside deployable LLM application controls. 
Route privacy lifecycle, tenant isolation, rollout sequencing, and generic supply-chain trust away unless prompt, retrieval, tool, or output boundaries are the dominant risk. +- Be concise: avoid generic prompt-injection background and prefer compact boundary maps, permission matrices, and eval tables. + +## Required Outputs + +- LLM threat model and trust-boundary map. +- Prompt, retrieval, tool, and output permission matrix. +- Tool confirmation, rate-limit, and audit requirements. +- Retrieval data-boundary and tenant-isolation checks. +- Input, feedback, and output validation table for text, files, links, rendered content, and downstream sinks. +- Output sink handling table for commands, queries, rendered content, structured tool inputs, documents/messages, and downstream prompts. +- Adversarial eval and regression plan. +- Prompt/response storage, access, retention, logging, and privacy requirements. +- Emergency stop and rollback plan for prompt, model/config, retrieval corpus/index, tool permissions, and training or fine-tuning inputs. +- Session isolation and cross-user leakage test plan. +- Model/prompt/tool/data supply-chain record with artifact ID, version, source, integrity checks, eval result, rollback target, and retire-by date. + +## Checks Before Moving On + +- `boundary_map`: prompt, user input, retrieved data, tool output, model output, and action sinks are mapped. +- `least_privilege`: tools and retrieval are scoped by user, tenant, action, and side effect. +- `input_validation`: prompt, feedback, file, link, hidden/control-character, and size/token controls are defined before model use. +- `output_handling`: model output is validated, encoded, or constrained before use in sensitive sinks. +- `adversarial_check`: prompt injection, leakage, unsafe-action, and regression tests exist. +- `sensitive_data_control`: prompt/response storage, redaction, retention, and human access rules are defined. 
+- `rollback_control`: prompt, model/config, retrieval, tool-permission, and training/fine-tuning inputs can be disabled or rolled back independently. +- `activity_log_check`: tool calls, user confirmations, retrieval IDs, and outcomes are linked without leaking sensitive data. + +## Red Flags - Stop And Rework + +- Retrieved documents can override system instructions. +- The model can call broad tools with production privileges. +- Model output is executed, queried, rendered, or sent without validation. +- User input can include unbounded content, hidden instructions, unchecked links, or unchecked files. +- Prompts and responses are stored broadly or exposed to human roles without purpose, retention, and access controls. +- Prompt templates, retrieval corpora, indexes, tool permissions, or model configuration cannot be rolled back independently. +- Shared conversation or retrieval state can leak between users, tenants, or requests. +- Eval set only tests happy-path answer quality. +- Logs capture secrets or private prompts without redaction. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Trusting the model to follow policy | Enforce policy in deterministic application controls. | +| Permission-checking after retrieval only | Check before retrieval and before action/use. | +| Treating prompts as config only | Version prompts as behavior-changing artifacts with release checks. | +| Treating guardrails as guarantees | Combine model-facing mitigations with deterministic application enforcement. | +| Ignoring prompt storage | Prompts and responses need classification, retention, access, and audit controls. | +| Evaluating model, not workflow | Test tool use, retrieval, output sinks, and confirmation paths. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/llm-evaluation.md b/plugins/sirmarkz/staff-engineer-mode/specialists/llm-evaluation.md new file mode 100644 index 00000000..c00579bf --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/llm-evaluation.md @@ -0,0 +1,123 @@ +--- +name: llm-evaluation +description: "Use when designing or changing model-backed evals needing datasets, graders, thresholds, slices, or triage" +--- + +# LLM Evaluation Harness Engineering + +## Iron Law + +``` +NO MODEL-BACKED CHANGE WITHOUT EVAL CASES, SCORING RULES, THRESHOLDS, REGRESSION HISTORY, AND FAILURE TRIAGE +``` + +If you cannot say what got better, what got worse, and which failures block release, the eval is not a release check. + +## Overview + +LLM behavior is production behavior when prompts, tools, retrieval, or model outputs affect users or workflows. + +**Core principle:** build eval harnesses with representative cases, stable scoring, slice coverage, regression history, and release thresholds before trusting model-backed changes. + +## When To Use + +- The user is designing, building, changing, or operating LLM evals, prompt tests, agent evals, graders, regression sets, acceptance thresholds, or model-backed workflow quality checks. +- A prompt, model, retrieval source, tool policy, or agent workflow changes and needs release checks. +- Existing evals are flaky, too small, too easy, judge-biased, or disconnected from production failures. +- You need repeatable checks for quality, safety, refusal, formatting, task completion, or user-impact slices. + +## When Not To Use + +- The main risk is prompt injection, tool misuse, data leakage, or unsafe actions; use `llm-application-security` instead. +- The main work is classical ML drift, training-serving skew, or model-serving readiness; use `ml-reliability-and-evaluation` instead. +- The request is broad AI coding-agent controls; use `ai-coding-governance` instead. 
+- The request is product strategy for which model to choose with no engineering check. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Workflow, user tasks, expected outputs, unacceptable failures, and release decision to support. +- Eval cases, production examples, synthetic cases, edge cases, slices, and known regressions. +- Scoring method, graders, rubrics, deterministic checks, human judgment, and tie-break rules. +- Thresholds, confidence needs, flake rate, baseline result, and comparison target. +- Versioned prompts, models, retrieval inputs, tools, datasets, and harness code. +- Failure triage workflow, severity, waiver rules, and re-run policy. + +## Workflow + +1. **Name the decision.** State whether the eval checks merge, release, prompt change, model change, or rollback. +2. **Build representative cases.** Include production-like tasks, edge cases, regressions, adversarial examples, and important user slices. +3. **Separate scoring types.** Use exact checks for structured requirements, rubric scoring for judgment, and human judgment for ambiguous high-impact cases. +4. **Control grader risk.** Define rubrics, blind comparisons where useful, calibration cases, and checks for scoring drift. +5. **Set thresholds first.** Declare pass, warn, and block criteria before looking at the new result. +6. **Version inputs.** Link prompts, model, retrieval corpus, tool policy, eval cases, graders, and harness code to the result. +7. **Triage failures.** Classify blockers, acceptable regressions, flaky cases, data issues, and missing coverage. +8. **Keep history.** Track baseline, deltas, regressions, waived failures, and production incidents that should become future cases. 
+ +## Synthesized Default + +Use a versioned eval harness with representative cases, slice coverage, deterministic checks where possible, calibrated rubric graders where needed, predefined thresholds, regression history, and explicit failure triage. Treat aggregate score improvements as insufficient when critical slices or known failure modes regress. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Early prototypes may use exploratory evals if they are not release checks. +- Human judgment can supplement automated scoring for high-impact or ambiguous tasks, but should use a written rubric. +- Low-risk copy changes may use a narrow regression set if output constraints and affected journeys are limited. + +## Response Quality Bar + +- Lead with the eval harness design, release check, failure triage, or threshold decision requested. +- Cover decision, cases, slices, scoring, thresholds, versioning, regression history, and failure handling before optional model discussion. +- Make recommendations actionable with dataset changes, grader rules, pass/fail criteria, and rerun policy where relevant. 
+- Name the details to inspect, such as eval cases, baseline runs, grader rubric, flake rate, slice results, versioned inputs, and failure log; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside model-backed evaluation checks. Route security, ML serving, or AI coding controls only when those risks dominate. +- Be concise: prefer eval matrices and release checks over generic eval theory. + +## Required Outputs + +- Eval harness specification. +- Case inventory with production, synthetic, edge, regression, and slice coverage. +- Scoring and grader rubric. +- Thresholds for pass, warn, block, and rollback. +- Versioned-input record. +- Failure triage table with disposition and next action. +- Regression history and case-promotion policy. + +## Checks Before Moving On + +- `decision_named`: eval result maps to a merge, release, rollback, or investigation decision. +- `case_coverage`: representative cases include critical tasks, slices, and known regressions. +- `scoring_defined`: checks, graders, rubrics, and tie-break rules are explicit. +- `thresholds_predeclared`: pass, warn, and block criteria are set before judging the change. +- `version_lineage`: prompts, model, data inputs, tools, eval cases, graders, and result are linked. + +## Red Flags - Stop And Rework + +- A single aggregate score hides critical slice regressions. +- The grader rubric changes between baseline and candidate. +- Eval cases are generated from the same prompt being tested with no independent check. +- Failures are waived without a reason and an expiry. +- Production incidents do not become regression cases. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Score first, threshold later | Set pass and block criteria before running. 
| +| Only happy-path cases | Add edge, adversarial, and regression cases. | +| Unversioned prompts | Link every input to every result. | +| Treating judge output as truth | Calibrate rubrics and inspect failures. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/llm-serving-cost-and-latency.md b/plugins/sirmarkz/staff-engineer-mode/specialists/llm-serving-cost-and-latency.md new file mode 100644 index 00000000..acfb1659 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/llm-serving-cost-and-latency.md @@ -0,0 +1,151 @@ +--- +name: llm-serving-cost-and-latency +description: "Use when LLM routes need token budgets, latency budgets, cache strategy, fallback behavior, or cost attribution" +--- + +# LLM Serving Cost And Latency + +## Iron Law + +``` +EVERY LLM-BACKED ROUTE DECLARES A TOKEN BUDGET, A LATENCY BUDGET, AND A DEGRADATION PATH +``` + +A route without all three is uncontrolled spend and uncontrolled tail. The first cost spike, provider outage, or runaway retry will make that visible at the worst time. + +## Overview + +Produces a per-route token and latency budget table, a cache strategy spec for prompts, embeddings, and responses, a degradation policy that names the fallback model and the degraded contract, and a cost-attribution model that maps spend to route, feature, and tenant. Refuses to ship an LLM-backed route whose tail latency, retry behavior, or per-call cost is not modeled. + +**Core principle:** an LLM call is a remote, expensive, tail-latency-dominated dependency whose unit cost is set per request by the prompt the caller assembles. Treat the prompt, the model tier, the cache, and the fallback as production design choices, not implementation details. + +## When To Use + +- The user is designing, building, or operating a route, agent, or background job that calls a hosted or self-served language model. +- Spend on model inference is rising faster than traffic and the cause is unclear. 
+- p95 or p99 latency on an LLM-backed path is unacceptable to users and you are choosing between caching, batching, smaller models, streaming, or removing calls. +- A model provider had an outage or degraded response and the route had no fallback. +- You need a token budget per request before launching a feature that calls the model in a loop, in a tool-use pattern, or per item in a list. +- Prompt cache hit rate, embedding reuse, or response cache invalidation rules need to be defined. +- A retry policy is amplifying token spend on partial failures and needs bounding. +- A multi-tenant or multi-feature workload needs cost attribution because one consumer is hiding behind aggregate spend. + +## When Not To Use + +- The risk is prompt injection, tool-call exfiltration, retrieval-boundary leakage, or unsafe sinks; use `llm-application-security`. +- The work is dataset construction, graders, regression thresholds, or eval checks; use `llm-evaluation`. +- The model is a custom-trained or fine-tuned production ML model with training/serving skew, drift, and rollback as the dominant concern; use `ml-reliability-and-evaluation`. +- The conversation is generic backend latency, queueing, or saturation with no model-specific behavior; use `performance-and-capacity`. +- The conversation is generic dollar cost without LLM-specific token and tier choices; use `cost-aware-reliability`. +- The conversation is generic remote-call resilience that happens to call a model; use `dependency-resilience` for circuit breakers, timeouts, and idempotency once the LLM-specific budgets and fallback are set here. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Route inventory: each LLM-backed user-facing route, agent loop, and background job, with caller, expected QPS, peak factor, and tier. 
+- Per-route prompt structure: system prompt size, context inserted per request, retrieved-document size and count, conversation history retained, tool definitions included, and structured-output schema where used. +- Model tier choice per route: which model is the default, which is the fallback or smaller alternative, and whether cascading or routing across tiers is in use. +- Token accounting: input tokens, output tokens, cached or reused tokens, average and tail per request, and whether streaming is used. +- Latency profile: p50, p95, p99 end-to-end, time-to-first-token where streaming, and provider-side latency vs in-process overhead. +- Cache state: prompt-prefix cache, embedding cache, full-response cache, semantic cache, per-tenant scope, TTL, invalidation triggers, and observed hit rates. +- Retry and timeout policy: max retries, backoff, idempotency of the operation, and the per-retry token cost. +- Failure modes observed: provider 5xx, rate limits, partial completions, tool-call malformation, schema-validation failures, and the cost amplification of each. +- Cost data: spend by model, by route, by feature, by tenant where available, and the engineering unit each maps to. +- Degradation expectations: what user contract holds when the primary model is unavailable, slow, or rate-limited, and what the caller is allowed to fall back to. + +## Workflow + +1. **Enumerate the routes.** List every path where the model is called, including tool-call loops and background jobs. A route you forgot is the route that breaks the cost model. +2. **Set token budgets per route.** Define a per-request input-token cap, an expected output-token cap, and a hard cap that triggers a degraded response. The budget is a contract; the prompt assembler must enforce it. +3. **Set latency budgets per route.** Define p50, p95, and p99 end-to-end targets. For interactive routes, also define a time-to-first-token target if streaming is used. 
For background jobs, define a wall-clock deadline and a per-item cost ceiling. +4. **Choose the model tier deliberately.** Match the smallest acceptable model to the route's quality bar. State the fallback tier and the conditions that switch to it. Cascading from cheaper to more expensive models is allowed when the cheaper model has a measurable quality threshold; without that threshold, cascading just doubles the cost. +5. **Design the cache layers.** Distinguish prompt-prefix cache (provider-side, depends on stable prefix), embedding cache (deterministic per text plus model version), full-response cache (deterministic per prompt), and semantic cache (lossy, requires confidence threshold and false-hit budget). State scope and invalidation per layer; per-tenant scope is required where prompts contain tenant data. +6. **Bound retries and timeouts.** Set max retries, backoff, and a per-call timeout shorter than the upstream timeout. Confirm the operation is idempotent at the model layer or that retries are guarded by an idempotency key. Compute the worst-case token cost as cost-per-attempt times max attempts; that is the real per-request budget. +7. **Write the degradation policy.** For each route, state what happens when the primary model is unavailable, rate-limited, slower than the latency budget, or returns malformed output. Options include fallback model, cached response, cached approximate response, partial answer with explicit signaling, queued for later, or refused with a defined error contract. Silent fallback that changes user-visible quality without signaling is not allowed. +8. **Decide batching versus streaming.** For interactive routes, streaming usually wins on perceived latency at similar cost. For batch jobs, batching wins on throughput and per-token cost where the provider supports it; deadlines and partial-failure semantics must be explicit. +9. 
**Bound structured-output cost.** Schema-constrained output and tool calls amplify token cost when the model retries to satisfy a schema. Cap retries, validate cheaply before re-prompting, and treat schema-validation failure as a first-class failure mode with a separate counter. +10. **Attribute cost.** Tag every model call with route, feature, and tenant where applicable. Aggregate spend and tail latency per tag. A cost spike with no per-tag breakdown is a finding by itself. +11. **Add guardrails and alerts.** Alert on per-route token-budget breach, per-tenant cost anomaly, cache-hit-rate regression, fallback rate, retry amplification, and tail-latency regression after a model or prompt change. +12. **Rehearse the degraded path.** Periodically force fallback or refusal in a low-impact environment so the degraded contract is real, not theoretical. + +## Synthesized Default + +Set per-route token and latency budgets before launch. Choose the smallest acceptable model and a defined fallback. Cache aggressively at the layer that matches the determinism of the call: prompt prefix, embedding, full response, or scoped semantic. Bound retries and structured-output reattempts. Stream for interactivity, batch for throughput. Tag every call for attribution. Always have a degraded path and rehearse it. Treat the prompt assembler as a piece of production code with dedicated budget tests. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. 
+- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- A research or eval workload running offline may relax latency budgets if cost and deadline are explicit. +- A safety-critical refusal path (the model is being used as a guardrail) may waive the cheaper-fallback rule because falling back to a weaker model defeats the purpose. +- A high-determinism route may use full-response cache as the primary path and call the model only on cache miss; the budget then governs miss rate, not steady-state spend. +- A regulated workload may forbid certain cache layers because of data-handling constraints; record the exception and the resulting cost impact. +- A first prototype may run with provisional budgets, but the route may not be exposed to production traffic until the budgets, fallback, and attribution are real. + +## Response Quality Bar + +- Lead with the per-route budget table, cache strategy, degradation policy, or attribution model requested. +- Cover token budget, latency budget, model-tier choice, cache layers, retry and timeout bounds, degradation path, and attribution before optional model breadth. +- Make recommendations actionable with per-route numbers, cache scopes and TTLs, fallback conditions, retry caps, and the alerts that catch regression. +- Name the details to inspect, such as per-route token histograms, latency percentiles, cache hit rates, fallback rate, retry rate, and per-tag spend; do not state a budget without the data behind it. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. 
+- Stay inside model-serving cost and latency. Route prompt-injection and tool-access risk, eval checks, generic backend performance, and generic dollar-cost optimization to the responsible specialist. +- Be concise: prefer compact route, cache, and fallback tables over generic LLM exposition. + +## Required Outputs + +- Per-route budget table with input-token cap, output-token cap, hard cap action, p50/p95/p99 latency target, and time-to-first-token target where streaming. +- Model-tier matrix per route with primary, fallback, and cascade conditions. +- Cache strategy spec covering prompt-prefix, embedding, full-response, and semantic caches with scope, TTL, invalidation, and observed or target hit rate per layer. +- Retry, timeout, and idempotency policy per route with computed worst-case token cost. +- Degradation policy per route covering primary unavailable, rate-limited, over-budget, and malformed-output cases, with the user-visible contract for each. +- Structured-output and tool-call cost bound: max validation retries, validation strategy, and the failure-mode counter. +- Cost-attribution model mapping spend to route, feature, and tenant, with the engineering unit each tag exposes. +- Alert and guardrail set: token-budget breach, tail-latency regression, cache-hit regression, fallback rate, retry amplification, and per-tenant cost anomaly. +- Rehearsal plan for the degraded path with cadence and verification path. + +## Checks Before Moving On + +- `token_budget_present`: every LLM-backed route has an input-token cap, an output-token cap, and a defined action when the cap is exceeded. +- `latency_budget_present`: every LLM-backed route has p50/p95/p99 targets and, where streaming, a time-to-first-token target. +- `model_tier_chosen`: every route names a primary model, a fallback model or refusal contract, and any cascade conditions. +- `cache_strategy_specified`: cache layers in use have scope, TTL, invalidation rule, and a target or measured hit rate. 
+- `retry_bound`: retry count, backoff, timeout, idempotency, and worst-case per-call token cost are computed. +- `degradation_path_specified`: each failure mode (unavailable, rate-limited, over-budget, malformed) has a user-visible contract and is rehearsed. +- `cost_attribution`: every call is tagged by route, feature, and tenant where applicable; spend can be sliced by tag. +- `tail_alerting`: alerts cover token-budget breach, tail-latency regression, cache-hit regression, fallback rate, retry amplification, and per-tenant cost anomaly. + +## Red Flags - Stop And Rework + +- The route has no input or output token cap and assembles its prompt by appending whatever the caller passes. +- Latency is reported as average only and the tail is not measured. +- A retry storm during a partial provider outage doubled or tripled spend and no retry cap or circuit breaker broke the loop. +- Schema-constrained or tool-call output retries silently until success, with no reattempt cap. +- Cache "works" but hit rate is unmeasured; cost behavior changes when the prompt template is edited and nobody notices. +- The fallback path is documented but has never been exercised; provider failure produces a real outage. +- Spend is reported in aggregate only and per-tenant or per-feature cost cannot be sliced. +- Streaming is used for latency optics but the caller still waits for the full response before rendering. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Picking the largest model by default | Match the smallest acceptable model to the route's quality bar; name the fallback. | +| Treating the prompt as free-form | Cap input tokens at the assembler and reject prompts that exceed the budget. | +| Caching without scope rules | Scope cache by tenant where prompts contain tenant data; state TTL and invalidation. | +| Unbounded retries on schema failures | Cap reattempts; treat schema failure as a counted failure mode. 
| +| Average-latency budgets | Budget p95 and p99; for interactive paths, also budget time-to-first-token. | +| No degraded contract | Define what the user sees when the primary model is unavailable; rehearse it. | +| Aggregate spend only | Tag every call by route, feature, and tenant; alert on per-tag anomalies. | +| Confusing cache layers | Distinguish prompt-prefix, embedding, full-response, and semantic caches; each has different determinism. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/migration-and-deprecation.md b/plugins/sirmarkz/staff-engineer-mode/specialists/migration-and-deprecation.md new file mode 100644 index 00000000..70bc2a77 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/migration-and-deprecation.md @@ -0,0 +1,126 @@ +--- +name: migration-and-deprecation +description: "Use when retiring services, sunsetting APIs, replacing libraries, or migrating many callers with no-new-usage checks" +--- + +# Large-Scale Change And Service Deprecation + +## Iron Law + +``` +NO DEPRECATION WITHOUT REPLACEMENT, USAGE TELEMETRY, MIGRATION PATH, AND BACKSLIDING CONTROL +``` + +Warnings without migration machinery are just noise. + +## Overview + +Removing or replacing a widely used system is a production change spread across many dependents. + +**Core principle:** discover real usage, provide a safe replacement, migrate incrementally, prevent new usage, and remove only after usage signals show dependents are gone. + +## When To Use + +- The user asks to deprecate, sunset, retire, decommission, replace, or remove a service, API family, library, platform, data product, or capability. +- A broad migration crosses many projects, repositories, services, clients, tenants, or runtime dependents. +- A large mechanical change needs staged execution, generated edits, responsibility routing, and non-regression controls. +- New usage must be blocked while old usage is migrated away. 
+ +## When Not To Use + +- The work is a routine dependency update, package bump, or small codemod; use `dependency-and-code-hygiene` instead. +- The work is API versioning for one service contract; use `api-design-and-compatibility` instead unless cross-system migration dominates. +- The work is database schema/backfill execution; use `database-operations` instead. +- The work is rollout sequencing for an already built change; use `progressive-delivery` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Deprecated thing, replacement, reason, deadline, risk, and support window. +- Static references, runtime calls, traffic, tenants, clients, jobs, dashboards, alerts, docs, and third-party dependents. +- Migration path, compatibility layer, dual-read/write needs, validation checks, and rollback/escape hatch. +- Advisory versus compulsory policy, enforcement checks, exception process, and communication channel. +- Backsliding prevention: build rules, lint/static checks, visibility controls, change-time warnings, templates, and docs. +- Disable and removal checklist: feature toggles, traffic cutoffs, dark traffic, jobs, support tools, snapshots/exports, code, config, data, credentials, alerts, dashboards, runbooks, costs, and access paths. + +## Workflow + +1. **Define the end state.** State what is being removed, what replaces it, what remains supported, and why the change is worth doing. +2. **Discover usage.** Combine code search, dependency graph, runtime telemetry, logs, responsibility metadata, and consumer outreach. +3. **Classify dependents.** Separate easy mechanical users, risky dynamic users, abandoned critical paths, and external clients. +4. **Choose migration mode.** Use advisory deprecation for low-risk nudges; use compulsory deadlines when responsibility and enforcement exist. +5. 
**Provide paved migration.** Supply examples, compatibility shims, codemods, validation commands, and rollback/escape hatches. +6. **Prevent backsliding.** Block or warn on new usage through change-time checks, build visibility, templates, docs, and policy checks. +7. **Migrate incrementally.** Move dependents in batches small enough to understand, test, and roll back; track progress with objective metrics. +8. **Disable before delete.** Stop or quarantine old runtime paths, watch for at least one representative business cycle, check dark traffic, jobs, support tools, and alerts, and keep an escape hatch until the old path stays quiet. +9. **Retire completely.** Remove runtime paths, data, config, credentials, dashboards, alerts, runbooks, docs, and cost artifacts after usage reaches the removal check; preserve required snapshots/exports with a retention period and disposal date. + +## Synthesized Default + +Treat deprecation as an engineered migration, not an announcement. Use centralized expertise for broad changes, automate repetitive edits, preserve compatibility while dependents move, enforce no-new-usage, and treat final decommissioning as a high-risk production deployment. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation.
+- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Emergency removal may skip normal windows when security or data-loss risk dominates, but needs explicit impact analysis and repair plan. +- External public clients may require longer overlap, stronger telemetry, and contractual support windows. +- Advisory deprecation is acceptable for low-risk cleanup when maintenance cost is small and no deadline is required. +- Abandoned dependents may require a user decision, compatibility shim, or replacement before removal. + +## Response Quality Bar + +- Lead with the migration plan, deprecation decision, usage inventory, or retirement blocker requested. +- Cover replacement readiness, usage measurement, dependent batching, no-new-usage controls, exception policy, disable-before-delete, and final cleanup before optional change-management breadth. +- Make recommendations actionable with migration batches, validation checks, deadlines, stop criteria, escape hatches, and retirement checks where relevant. +- Name the details to inspect, such as static references, runtime telemetry, dependent replacement examples, block/warn controls, dark-traffic checks, and disposal records; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside engineered migration and deprecation. Route architecture redesign or vulnerability emergency handling only when those are the central unresolved risk. +- Be concise: avoid generic program-management language and prefer compact inventories, migration batch tables, and retirement checklists. + +## Required Outputs + +- Deprecation decision record with replacement, reason, and end state. +- Usage inventory with static and runtime checks. 
+- Dependent classification and migration batches. +- Migration guide, examples, validation, and escape hatch. +- Backsliding prevention controls. +- Enforcement, exception, and deadline policy. +- Disable-before-delete plan with watch-window results and disposal handling. +- Final retirement checklist. + +## Checks Before Moving On + +- `usage_inventory`: static and runtime usage are measured, or blind spots are named. +- `replacement_ready`: replacement path is documented, supported, and validated for representative dependents. +- `migration_batches`: dependents are grouped into maintained, linked, reversible batches. +- `backsliding_control`: new usage is blocked, warned, or explicitly exception-checked. +- `retirement_check`: disable-before-delete and the watch window are complete, and code, config, data, credentials, alerts, runbooks, docs, and cost artifacts are removed or retained with an explicit reason. + +## Red Flags - Stop And Rework + +- A deprecation warning has no replacement, deadline, or telemetry. +- New users can still copy old examples and add fresh dependencies. +- Migration success is counted by emails sent rather than usage removed. +- Removal happens before dark traffic, jobs, support tools, and external clients are checked. +- The old system keeps alerts, credentials, and costs after "retirement". + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Announcing instead of migrating | Provide tooling, examples, and maintained batches. | +| Relying only on static search | Add runtime telemetry for dynamic dependents. | +| Ignoring backsliding | Block new usage while old usage is removed. | +| Stopping at code deletion | Retire operational, data, access, and cost surfaces too.
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/ml-reliability-and-evaluation.md b/plugins/sirmarkz/staff-engineer-mode/specialists/ml-reliability-and-evaluation.md new file mode 100644 index 00000000..cbb8aea9 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/ml-reliability-and-evaluation.md @@ -0,0 +1,131 @@ +--- +name: ml-reliability-and-evaluation +description: "Use when ML model-serving changes or promotions need evals, data validation, drift, skew, rollback, or checks" +--- + +# ML Systems Reliability And Evaluation + +## Iron Law + +``` +NO MODEL PROMOTION WITHOUT DATA CHECKS, EVAL CHECKS, SERVING MONITORING, AND ROLLBACK +``` + +Offline accuracy alone is not production readiness. Promote an identifiable artifact, watch serving behavior, check drift and training-serving skew, and keep rollback ready. + +## Overview + +Production ML reliability is software reliability plus data reliability plus model behavior reliability. + +**Core principle:** promote models only when data, features, evals, serving behavior, rollout, monitoring, and rollback are all controlled. + +## When To Use + +- The user asks about production ML readiness, model serving, training pipelines, eval checks, feature validation, training-serving skew, drift, model rollout, or model rollback. +- A model artifact, feature pipeline, training job, or inference path is changing. +- The user needs to monitor model quality, prediction distribution, data drift, or serving latency. +- A launch or PRR includes ML behavior. + +## When Not To Use + +- The work is generic warehouse/ETL reliability with no model production concern; use `data-pipeline-reliability` instead. +- The request is broad AI policy or model strategy; out of scope unless framed as production engineering. +- The system is an LLM or agent app with prompt/tool security risk; use `llm-application-security` instead. +- The work is offline experimentation only and will not affect production. 
+ +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Model use case, user impact, failure consequence, and production tier. +- Training data, feature definitions, schemas, labels, transform code, and serving data sources. +- Offline eval metrics, acceptance thresholds, slices/cohorts, fairness/safety checks where relevant, and regression history. +- Training-serving consistency checks, feature freshness, null/default behavior, and schema drift. +- Model artifact version, data version, config, dependencies, and rollout unit. +- Serving SLOs, latency, saturation, fallback behavior, monitoring, and rollback path. +- Drift, quality, feedback, incident, and human-review signals. + +## Workflow + +1. **Establish a non-ML baseline.** Confirm the system needs ML and has a deterministic fallback or baseline where appropriate. +2. **Validate data and features.** Check schema, ranges, missingness, distributions, freshness, and transform consistency. +3. **Check training-serving skew.** Compare feature generation, preprocessing, defaults, and dependency versions across training and serving. +4. **Define eval checks.** Use offline metrics, slice metrics, regression tests, adversarial/security checks, safety/business constraints, and minimum deltas for promotion. +5. **Version everything.** Link model artifact, code, features, data snapshot, config, eval result, and serving environment. +6. **Roll out progressively.** Use shadow, canary, cohort, percentage, or holdback where feasible; monitor serving and model behavior. +7. **Monitor production with thresholds.** Track serving SLOs, prediction distribution, feature drift, data freshness, quality proxies, feedback loops, and capacity or quota saturation; name alert thresholds or rollback triggers for at least two of those signals. +8. **Prepare rollback.** Keep prior artifact/config available and define when to roll back, disable, or route to baseline.
+ +## Synthesized Default + +Check ML releases on data validation, eval results, threat-informed failure-mode checks, training-serving consistency, versioned artifacts, progressive rollout, serving SLOs, thresholded drift and serving monitoring, and rollback. Treat model-only evaluation as insufficient for production readiness. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Non-production exploration may use lighter checks if isolated and clearly not used for decisions. +- Some models lack immediate ground truth; use proxy metrics, delayed labels, human review, or guardrail metrics. +- High-risk decisions may require human-in-the-loop, additional safety checks, or stricter slice checks. +- Batch scoring may use pipeline freshness and output validation instead of synchronous serving latency. + +## Response Quality Bar + +- Lead with the launch decision, eval check, or model-risk blocker. +- Cover offline/online evals, guardrails, drift/skew, rollback, and monitoring before optional ML-platform breadth. +- Make recommendations actionable with checks, stop conditions, and rollback or shadow-mode criteria where relevant. 
+- Name the details to inspect, such as offline metrics, online guardrails, cohort slices, drift signals, and rollback checks; do not state details you have not seen. +- For production monitoring, give thresholded alerts or stop conditions for at least two signals, such as prediction drift, training-versus-serving feature distribution drift, serving latency, data freshness, saturation, or quota exhaustion. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside model-serving reliability and evaluation unless the prompt asks for broader product or research strategy. +- Be concise: avoid generic ML background and prefer compact eval and rollout matrices. + +## Required Outputs + +- ML production readiness checklist. +- Data and feature validation plan. +- Training-serving skew review. +- Offline and production eval check plan. +- AI/ML failure-mode and adversarial/security evaluation plan where misuse or dependency manipulation can affect users. +- Versioning and artifact lineage record. +- Model rollout and rollback plan. +- Drift, quality, freshness, serving latency, and capacity/quota monitoring requirements with alert thresholds and response paths. +- Incident path and residual risk notes. + +## Checks Before Moving On + +- `data_validation`: training and serving data have schema, freshness, distribution, and missingness checks. +- `eval_check`: promotion thresholds, regression checks, and slice criteria are stated. +- `skew_check`: training-serving feature and transform differences are checked. +- `version_lineage`: model, code, data, features, config, and eval result are linked. +- `monitoring_thresholds`: prediction drift, feature distribution drift, latency, freshness, saturation, or quota signals have alert thresholds and response paths. 
+- `rollback_check`: prior model or safe fallback is available with trigger criteria. + +## Red Flags - Stop And Rework + +- Offline aggregate accuracy is the only launch check. +- Feature generation differs between training and serving with no skew check. +- Model artifact cannot be tied to data, code, config, and eval result. +- Rollback requires retraining under incident pressure. +- Drift is monitored without a decision rule, threshold, or response path. +- Serving latency, capacity, or quota risk is discussed without alert thresholds. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating ML as only a model file | Include data, features, serving, evals, rollout, and monitoring. | +| Ignoring slices | Evaluate important cohorts and failure-sensitive segments. | +| Waiting for labels only | Use proxy and delayed-quality signals where ground truth lags. | +| No fallback | Keep prior model, rule baseline, or disable path where impact warrants it. | +| Vague monitoring | Add alert thresholds for drift, feature distribution, latency, and capacity or quota signals. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/mobile-release-engineering.md b/plugins/sirmarkz/staff-engineer-mode/specialists/mobile-release-engineering.md new file mode 100644 index 00000000..f5e85b0a --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/mobile-release-engineering.md @@ -0,0 +1,126 @@ +--- +name: mobile-release-engineering +description: "Use when planning mobile rollouts needing staged release, stability checks, offline behavior, or kill switches" +--- + +# Mobile Release Engineering And Crash Budgets + +## Iron Law + +``` +NO BROAD MOBILE ROLLOUT WITHOUT STABILITY BUDGETS, SEGMENTED TELEMETRY, HALT CRITERIA, AND FORWARD-FIX PLAN +``` + +If the release cannot be halted or repaired under app-store/client constraints, do not widen exposure. 
+ +## Overview + +Mobile releases are hard to roll back, so stability checks must be conservative before broad rollout. + +**Core principle:** use staged rollout, crash/hang budgets, device/OS segmentation, startup/offline checks, privacy-safe telemetry, and forward-fix plans. + +## When To Use + +- The user is planning, changing, or reviewing a native mobile release train, staged rollout, phased release, crash-free users/sessions, hang rates, startup, offline behavior, mobile telemetry, or app-store release risk. +- A mobile app release could affect stability across devices, OS versions, networks, or app versions. +- A mobile rollout needs thresholds to continue, halt, or forward-fix. +- Client upgrade lag or rollback limits change release strategy. + +## When Not To Use + +- The request is responsive web or browser performance; use `web-release-gates` instead. +- The issue is backend-only latency or availability; use `performance-and-capacity` or `slo-and-error-budgets` instead. +- The work is mobile product strategy, acquisition, store listing optimization, or UX roadmap. +- The question is general CI check policy without mobile release constraints; use `testing-and-quality-gates` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Platforms, release train, app versions, staged rollout percentages, and store review constraints. +- Stability metrics: crash-free users/sessions, hang rate, startup failures, fatal/non-fatal error rate, and watchdog events. +- Device/OS/app-version/network segmentation and known high-risk cohorts. +- Critical journeys, offline behavior, sync/data-loss risk, and backend compatibility. +- Telemetry fields, privacy controls, symbolication/deobfuscation, and alerting thresholds. +- Rollback, halt, kill switch, remote config, and forward-fix options. + +## Workflow + +1. 
**Define mobile SLIs.** Use crash-free users/sessions, hang rate, startup success, and critical journey success. +2. **Segment the rollout.** Check by platform, app version, device class, OS version, geography/network, or cohort where risk warrants it. +3. **Set staged thresholds.** Define metrics and sample-size requirements for each widening step. +4. **Use explicit stability checks.** If local budgets are missing, propose provisional checks with windows: crash-free users at least 99.5%, crash-free sessions at least 99.9%, hang/ANR rate no worse than baseline plus 10% and below the app's severe-alert threshold, measured over each 24-hour rollout step before widening. +5. **Check compatibility.** Verify backend, API, schema, feature flag, and config compatibility with old and new app versions. +6. **Plan offline and sync behavior.** Test intermittent network, stale config, retry, conflict, and data-loss scenarios. +7. **Protect privacy.** Avoid sensitive data in crash reports, logs, breadcrumbs, and custom keys. +8. **Define halt/repair.** Decide when to halt rollout, disable features, revert server flags, or submit a forward fix. +9. **Monitor long tail.** Track old versions and slow adoption after the main rollout completes. + +## Synthesized Default + +Use staged mobile rollout with crash-free, hang, startup, and critical-journey budgets as release checks. Account for slow upgrade curves and limited rollback by keeping kill switches, compatibility windows, and forward-fix paths ready. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. 
+- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Emergency security or compliance fixes may move faster, but staged telemetry and rollback/forward-fix criteria still apply. +- Very small internal distributions can use lighter checks if users and devices are known. +- Some app-store constraints force forward-fix rather than rollback; document this before broad rollout. +- Privacy constraints may limit telemetry detail; preserve enough aggregate signal to detect regressions. + +## Response Quality Bar + +- Lead with the staged rollout decision, halt criteria, or stability budget requested. +- Cover crash-free, hangs, startup, critical journey, segmentation, and repair path before optional mobile release topics. +- Include numeric stability thresholds and measurement windows when recommending rollout checks; clearly label provisional defaults if the user has not supplied project-specific budgets. +- Make recommendations actionable with checks, stop conditions, and forward-fix or kill-switch actions where relevant. +- Name the details to inspect, such as crash-free sessions/users, OS/device cohorts, sample sizes, app versions, and telemetry readiness; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside mobile release risk. Mention backend/API/config compatibility only where it affects client rollout safety. +- Be concise: avoid generic mobile-release background and prefer compact rollout tables. 
+ +## Required Outputs + +- Mobile release train and staged rollout plan. +- Crash-free users/sessions, hang/ANR, startup, and critical-journey budgets with numeric thresholds and measurement windows. +- Device/OS/app-version segmentation plan. +- Backend/API/config compatibility plan. +- Offline/sync test and telemetry plan. +- Halt, rollback, kill-switch, and forward-fix criteria. +- Privacy-safe mobile telemetry checklist. + +## Checks Before Moving On + +- `stability_budget`: crash-free, hang, startup, and critical journey thresholds are defined. +- `segment_check`: device, OS, app version, and network/cohort segmentation is considered. +- `compatibility_check`: backend, API, config, and old-version compatibility are addressed. +- `halt_fix_check`: rollout halt, kill switch, rollback, or forward-fix path is explicit. +- `privacy_check`: crash/log telemetry avoids sensitive data and has a symbolication/debuggability path. + +## Red Flags - Stop And Rework + +- Release goes to 100 percent before stability metrics have sufficient sample size. +- Only aggregate crash rate is watched; device/OS cohorts are ignored. +- Backend changes break older app versions. +- Crash reports include sensitive data. +- Rollback is assumed even though client distribution cannot force a downgrade. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating mobile like web deploys | Account for store review, upgrade lag, and rollback limits. | +| Aggregate stability only | Segment by platform, device, OS, app version, and cohort. | +| Ignoring offline | Test sync, retry, stale config, and conflict behavior. | +| No forward-fix plan | Prepare kill switches, server flags, and patched release path.
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/observability-and-alerting.md b/plugins/sirmarkz/staff-engineer-mode/specialists/observability-and-alerting.md new file mode 100644 index 00000000..5cd11526 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/observability-and-alerting.md @@ -0,0 +1,136 @@ +--- +name: observability-and-alerting +description: "Use when telemetry, dashboards, alert rules, or runbooks need design outside SLO or release-check policy" +--- + +# Observability And Alerting + +## Iron Law + +``` +TELEMETRY STARTS FROM USER SYMPTOMS; URGENT ALERTS NEED USER IMPACT, URGENCY, ACTIONABILITY, AND A RUNBOOK +``` + +Telemetry that does not map to a user-visible symptom is decoration. An alert that lacks impact, urgency, actionability, or a runbook should not be urgent by default. The two halves are co-designed: signals exist so that someone can act on them, and urgent alerts fire only on signals that show user-felt impact. + +## Overview + +Produces telemetry requirements tied to user journeys, a dashboard specification that answers impact and recent change, and an alert policy where every urgent alert has user impact, urgency, actionability, and a runbook. Refuses host-health urgent alerts, anonymous alerts, and dashboards built from whatever the platform happened to emit. + +**Core principle:** instrument user-visible symptoms first, then add enough causal context to debug without guessing. + +## When To Use + +- The user is designing, building, or revising metrics, logs, traces, dashboards, alerting, runbooks, correlation IDs, telemetry fields, or production debugging paths. +- A service cannot explain incidents from existing signals. +- The user asks how to instrument a new service, dependency, queue, pipeline, or rollout. +- Alert rules are the main deliverable and the work is not asking to connect them to SLO or error-budget policy. 
+ +## When Not To Use + +- The user needs reliability targets, SLO math, SLO-based urgent/follow-up policy, or budget policy; use `slo-and-error-budgets` instead. +- The user needs to reduce existing urgent-alert volume or toil; use `oncall-health` instead unless new telemetry is central. +- The user is in a live incident; route to `incident-response-and-postmortems` first. +- The work is only local development logging without production operations impact. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Critical user journeys, SLOs, service tier, and incident history. +- Request paths, dependency map, queues, data stores, batch jobs, and external integrations. +- Existing metrics, logs, traces, dashboards, alerts, runbooks, and known blind spots. +- Dashboard purpose, first-screen health question, metric definitions, missing-signal behavior, visual status rules, and alert-to-runbook links. +- Fault-domain labels needed for impact analysis, such as location, deployment unit, partition, shard, tenant, and deployment stage. +- Deployment markers, version identifiers, feature/config flags, tenant/customer context, and correlation identifiers. +- Privacy constraints, sensitive fields, retention requirements, and sampling limits. +- Responder workflow: where urgent alerts go, what local response path handles them, and how runbooks are used. + +## Workflow + +1. **Start with symptoms.** Define what users notice: failed requests, slow actions, stale data, dropped work, lost messages, or incorrect results. +2. **Add golden signals.** Capture latency, traffic, errors, and saturation for services; utilization, saturation, and errors for resources. +3. **Instrument dependencies.** Include call count, latency, errors, timeouts, retries, queue depth, queue age, and drain rate. +4. 
**Connect events.** Propagate trace context across every service boundary so the trace identifier is global to a request and span identifiers are local to each unit of work; attach deployment/change markers. +5. **Structure logs and events.** Require a baseline field set on every entry — UTC timestamp, severity, service identifier, trace identifier, request identifier, and message — plus stable fields for operation, tenant/customer context where safe, dependency, result, error class, and latency. +6. **Define the health model.** State healthy, degraded, unavailable, and recovering conditions at component, dependency, journey, and workload levels; distinguish transient degradation from sustained unavailability. +7. **Design dashboards for questions.** Build the first view so impact is visible quickly, then drill down by scope, fault domain, recent changes, dependencies, saturation, and recovery progress. Every displayed metric needs unit, source, label semantics, threshold/window, and missing-data behavior; color cannot be the only status signal. +8. **Make absent signals explicit.** Emit zero when zero is meaningful, and treat missing samples as a separate health state instead of letting silence look healthy. +9. **Alert on symptoms.** Use SLO burn or direct user-impact alerts. Keep diagnostic and causal alerts as follow-ups unless urgent and actionable. +10. **Identify affected customers safely.** For customer-impacting services, define privacy-safe signals that support impact scoping and notification. +11. **Attach runbooks.** Every urgent alert needs triage steps, impact check, mitigation options, fallback path, and rollback/fallback links. + +## Synthesized Default + +Use SLO/user-journey symptoms, layered health models, golden signals, fault-domain labels, structured events, distributed context, deployment markers, and dependency signals as the default telemetry set. 
Use urgent alerts only when action is required now; use dashboards and follow-ups for investigation and slow-burn work. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Early-stage services may begin with a minimal symptom dashboard and expand after real failure modes are known. +- Low-volume systems may need synthetic checks or heartbeat/freshness signals to detect user impact. +- Security and privacy constraints may require redaction, hashing, sampling, or separate audit trails. +- Some critical causal signals can trigger urgent alerts if they are tested leading indicators with a runbook. + +## Response Quality Bar + +- Lead with the dashboard spec, alert classification, telemetry gap, or runbook requirement requested. +- Cover user journeys, health states, golden signals, dependency context, deployment markers, privacy-safe events, urgent-alert policy, and runbooks before optional observability breadth. +- Make recommendations actionable with metric/log/trace names, thresholds, routes, runbook links, failure response, and rollout checks where relevant. 
+- Name the details to inspect, such as SLOs, metric sources, log fields, trace context, alert history, runbook content, deploy markers, and sensitive-data handling; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside observability and alerting. Route SLO definition, on-call policy, or incident response only when those are the central unresolved risk. +- Be concise: avoid generic telemetry lists and prefer compact journey-to-signal and alert-policy tables. + +## Required Outputs + +- Telemetry requirements mapped to user journeys and dependencies. +- Dashboard specification for impact, scope, dependencies, saturation, and recent changes. +- Metric definition table covering unit, source, labels, threshold/window, owner path, and missing-signal behavior. +- Fault-domain and affected-customer scoping signals where relevant. +- Alert policy with urgent/follow-up/diagnostic classification. +- Structured log/event field standard and sensitive-data handling. +- Trace or context propagation requirements. +- Runbook requirements for every urgent alert. +- Gaps and follow-up routes to SLO, on-call, incident, or platform work. + +## Checks Before Moving On + +- `symptom_first`: urgent alerts map to SLO burn or direct user-visible impact. +- `health_model`: component and dependency signals aggregate into critical-journey and workload health states. +- `causal_context`: telemetry includes dependency, correlation, version/change, and saturation context. +- `fault_domain_context`: telemetry can separate impact by location, deployment unit, partition, shard, tenant, or deployment stage where those domains exist. +- `dashboard_scan`: the first dashboard view shows user impact quickly and supports drill-down by scope, fault domain, dependency, change, and recovery state. 
+- `metric_definition`: user-facing metrics define unit, source, labels, threshold/window, and missing-signal behavior. +- `missing_signal_behavior`: missing samples and zero values are distinguishable where that difference changes health. +- `runbook_link`: every urgent alert has a runbook with impact check, mitigation, fallback, and verification. +- `privacy_check`: sensitive data handling is defined for logs, traces, labels, and events. +- `debug_path`: dashboards answer impact, scope, cause candidates, recent changes, and recovery state. + +## Red Flags - Stop And Rework + +- Dashboards start from whatever the platform emits instead of user journeys. +- Every dependency error triggers an urgent alert even when retries hide user impact. +- Logs contain sensitive data or unbounded high-cardinality fields without controls. +- Alerts have no runbook or response path. +- Metrics show averages only and hide tail latency or saturation. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Collecting everything | Collect signals that answer operational questions. | +| Urgent alerts on causes | Alert on symptoms; use causes for debugging. | +| Ignoring changes | Add deployment, config, and feature markers. | +| Logging prose | Use stable structured fields. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/oncall-health.md b/plugins/sirmarkz/staff-engineer-mode/specialists/oncall-health.md new file mode 100644 index 00000000..240d53e6 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/oncall-health.md @@ -0,0 +1,134 @@ +--- +name: oncall-health +description: "Use when pages, suppression rules, toil, runbook gaps, or recurring manual ops are hurting responders" +--- + +# Oncall Health And Toil Reduction + +## Iron Law + +``` +NO RECURRING PAGE OR MANUAL RUNBOOK STEP WITHOUT A FIX PATH AND ELIMINATION PLAN +``` + +If the same alert or manual operation keeps recurring, the system is asking for engineering work. 
+ +## Overview + +Repeated pages and manual operations are engineering defects. + +**Core principle:** keep pages urgent and actionable, convert repeated manual work into durable fixes, and protect responders from avoidable operational load. + +## When To Use + +- The user is designing or revising paging alerts, toil, runbook, suppression, escalation, or manual-operation decisions that affect responder load. +- The user asks to reduce pages, alert fatigue, toil, manual operations, repeated runbook work, or operational burden. +- On-call responders are paged by non-urgent, unactionable, duplicate, or noisy alerts. +- Manual mitigations are repeated often enough to automate or remove. +- Runbooks are missing, stale, unsafe, or too vague to execute under pressure. + +## When Not To Use + +- The user asks about staffing, compensation, rotation fairness, headcount, or HR process; out of scope unless reframed as technical toil reduction. +- The main deliverable is new telemetry or alert construction; use `observability-and-alerting` instead. +- The main work is defining SLOs or paging thresholds from scratch; use `slo-and-error-budgets` instead. +- The request is generic developer productivity with no operational pain; out of scope for routed specialists. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Paging history: alert name, count, time, severity, duration, action taken, user impact, and fix path. +- Toil inventory: manual, repetitive, automatable, tactical, page-driven work. +- Runbooks, fallback paths, responsibility, checkpoint notes, and incident/postmortem actions. +- Alert-to-runbook reachability, runbook freshness, impact check, mitigation path, and verification step for paging alerts. +- Alert policy: paging versus non-paging response, SLO mapping, diagnostic alerts, dedupe, grouping, and suppression. 
+- Automation candidates, recurring incident classes, platform gaps, and unsafe manual steps. +- Responder load: after-hours pages, sleep-impacting pages, unresolved alerts, and checkpoint friction. + +## Workflow + +1. **Classify pages.** Mark each paging alert as urgent/actionable/user-visible/novel, non-paging, diagnostic, duplicate, stale, or false positive. +2. **Find top load sources.** Rank by page count, duration, user impact, recurrence, and manual effort. +3. **Separate symptom from cause.** Keep user-impact pages, but remove duplicate cause alerts unless they drive distinct action. +4. **Fix runbooks.** Every paging alert needs a reachable, current runbook with impact check, mitigation, fallback, rollback, and verification. +5. **Eliminate toil.** Automate, self-heal, remove, or redesign repeated manual operations; do not just document them better. +6. **Create an engineering backlog.** Give every recurring class a priority, expected page reduction, and verification metric. +7. **Protect the signal.** Use SLO burn, grouping, dedupe, maintenance windows, and non-paging routing to prevent alert erosion. +8. **Set a page-rate budget.** State a numeric per-shift or per-week page target and how it will be measured. Compare against current rate. +9. **Check runbook freshness.** For every paging alert, record runbook last-verified date and require freshness cadence alongside coverage. +10. **Refresh regularly.** Feed incident/postmortem findings back into alert rules, platform work, and reliability standards. + +## Synthesized Default + +Pages should be urgent, actionable, user-visible, and novel. Everything else should become a non-paging follow-up, automation, grouping, suppression, or removal. Toil reduction should produce engineering work with measured page or manual-effort reduction, and live-site responsibility should feed back into product engineering priorities. 
+ + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Some pre-user-impact alerts may page if they are tested leading indicators with a safe, immediate mitigation. +- Low-tier internal systems may route most operational signals to non-paging follow-up if user impact is limited and the user accepts the response latency. +- Temporary noisy alerts are allowed during a risky migration only with an expiry and a cleanup task. +- Staffing and compensation questions remain out of scope unless translated into technical page/toil reduction. + +## Response Quality Bar + +- Lead with the page classification, toil inventory, alert-change decision, or automation backlog requested. +- Cover urgency, actionability, user visibility, novelty, runbook quality, repeated manual work, responsibility, and measurement before optional on-call breadth. +- Make recommendations actionable with page/follow-up/remove decisions, runbook fixes, automation tasks, expiry dates, and measured reduction targets where relevant. 
+- Name the details to inspect, such as alert history, pages per responder, after-hours volume, runbook links, toil hours, manual steps, suppression rules, and incident outcomes; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside technical on-call health and toil. Mark staffing, compensation, and HR questions out of scope unless translated into engineering controls. +- Be concise: avoid generic on-call advice and prefer compact page inventories and remediation backlogs. + +## Required Outputs + +- Paging-alert inventory and classification. +- Top toil sources with frequency, effort, and removal path. +- Alert changes: page/follow-up/remove/group/dedupe/suppress decisions. +- Runbook gap list and required updates. +- Automation or redesign backlog with expected page/manual-effort reduction. +- Responsibility and fallback fixes. +- Measurement plan for page volume, after-hours pages, and toil hours. +- Numeric page budget per shift or week with the measurement window and source. +- Runbook coverage AND freshness check (last-verified date, freshness cadence) for each paging alert. +- Alert-to-runbook path showing each paging alert reaches a current runbook with impact check, mitigation, and verification. + +## Checks Before Moving On + +- `paging_classification`: each paging alert is classified by urgency, actionability, user visibility, and novelty. +- `toil_inventory`: repeated manual work has a frequency and an elimination or automation plan. +- `runbook_check`: remaining paging alerts link to executable runbooks with mitigation and verification. +- `alert_runbook_path`: each paging alert links to a reachable, current runbook with impact check, mitigation path, and verification step. 
+- `noise_reduction`: proposed changes state expected page or toil reduction and how it will be measured. +- `scope_check`: staffing, compensation, and HR issues are reframed or marked out of scope. +- `page_budget`: a numeric per-shift or per-week page target is stated with measurement window. +- `runbook_freshness`: each paging alert has a last-verified date and a freshness cadence. + +## Red Flags - Stop And Rework + +- The solution is "make the runbook longer" for repeated manual work. +- A paging alert has no action other than "look at dashboard". +- Responders routinely silence, ignore, or rerun alerts. +- Every cause alert pages alongside the symptom alert. +- Alert reduction removes the only user-impact signal. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating pages as inevitable | Treat avoidable pages as engineering defects. | +| Automating bad operations | Remove or redesign unsafe manual work when possible. | +| Deleting noisy alerts blindly | Preserve user-impact coverage and verify replacement signal. | +| Measuring only count | Track after-hours load, duration, recurrence, and toil hours. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/performance-and-capacity.md b/plugins/sirmarkz/staff-engineer-mode/specialists/performance-and-capacity.md new file mode 100644 index 00000000..16401acd --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/performance-and-capacity.md @@ -0,0 +1,135 @@ +--- +name: performance-and-capacity +description: "Use when tail latency, load tests, saturation, capacity, headroom, or peak/failover traffic need analysis" +--- + +# Capacity Performance And Tail Latency + +## Iron Law + +``` +NO CAPACITY OR PERFORMANCE PLAN WITHOUT A TRAFFIC MODEL, TAIL METRIC, SATURATION SIGNAL, AND LOAD-TEST RESULTS +``` + +If the answer only says "scale horizontally" or reports averages, it is not enough. + +## Overview + +Users experience tail latency, not averages. 
+ +**Core principle:** model demand, concurrency, queueing, saturation, and fanout, then test to the knee of the curve before production finds it. + +## When To Use + +- The user asks about p95, p99, p99.9, throughput, QPS, concurrency, queueing, saturation, hot paths, or scaling limits. +- A release caused latency or throughput regression. +- A launch, PRR, or migration needs capacity test results. +- The system needs load, stress, spike, soak, or failure-condition testing. +- Cost is discussed as a capacity/headroom tradeoff rather than a billing support question. + +## When Not To Use + +- The main problem is retries, timeouts, or dependency failure safety; use `dependency-resilience` instead. +- The main request is public edge abuse, denial-of-service defense, or application-layer filtering; use `edge-traffic-and-ddos-defense` instead. +- The user asks pure billing/procurement questions; out of scope. +- The work is SLO target selection without performance investigation; use `slo-and-error-budgets` instead. +- The regression is explicitly tied to a query plan, index, or schema migration; use `database-operations` instead. +- The request is browser field/lab release checks for a frontend rollout; use `web-release-gates` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- User journeys, SLOs, latency percentiles, throughput targets, and acceptable degradation behavior. +- Traffic model: current, peak, forecast, burstiness, tenant skew, payload size, and fanout. +- Resource signals: CPU, memory, IO, network, lock contention, connection pools, thread pools, queue depth, queue age, and GC. +- Load-balancing behavior, locality, shard keys, hot partitions, cache hit rate, and downstream quotas. +- Existing load tests, production incidents, profiling/flame graphs, and regression data. 
+- Tested breakpoint, startup-to-ready time, recovery time after stress, and profile differences between normal and heavy load. +- Headroom rule, autoscaling behavior, static failed-domain capacity, and unit-cost constraints. + +## Workflow + +1. **Frame the answer before inspection.** Start with a compact provisional check frame: target percentile and boundary; load-test method with scenarios and pass/stop criteria; headroom plus USE signal; overload mechanism and priority; queue-depth or in-flight work metric plus backpressure; hot-path/key hypothesis plus mitigation. Mark unknowns and refine them after investigation. +2. **Define the user-visible target.** Choose p95/p99/p99.9 and throughput targets that map to SLOs or launch requirements. +3. **Build the demand model.** Capture request rate, burstiness, concurrency, fanout, payload, tenant skew, and seasonal peaks. +4. **Apply queueing sanity checks.** Use Little's Law to connect arrival rate, latency, and concurrency; identify queues that can hide saturation. +5. **Find saturation points.** Track RED for services and USE for resources. Include locks, connection pools, thread pools, caches, and downstream quotas. +6. **Test to the knee.** Run load/stress/spike/soak tests in production-like environments until latency or errors become nonlinear; record the breakpoint, startup-to-ready time, recovery behavior after stress, and the profile differences that explain bottlenecks. +7. **Protect the system.** Define admission control, load shedding, prioritization, and graceful degradation before saturation. +8. **Investigate regressions scientifically.** Compare before/after profiles, deploy markers, dependency metrics, cache behavior, and resource saturation. +9. **Model failed-domain headroom.** For HA requirements, show remaining domains have enough already-available capacity at peak; do not count emergency scaling as the primary recovery mechanism. +10. 
**Tie capacity to cost when relevant.** Preserve required headroom and failover capacity; optimize unit economics only after risk is explicit. + +## Synthesized Default + +Optimize around tail percentiles, saturation, queue age, and headroom rather than averages. Combine tail-at-scale design, SRE golden signals, performance baselines, load-shedding practice, and unit-cost discipline when cost is explicitly part of the reliability tradeoff. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Batch pipelines may use freshness and completion latency instead of request p99; route to data pipeline reliability when the system is mainly ETL. +- Internal low-tier tools may use lower headroom or follow-up-only alerts when the user accepts the SLO tradeoff. +- Hedged requests can reduce tail latency only when extra load is budgeted and duplicate work is safe. +- Predictive scaling helps predictable demand, but cold-start latency must not sit on a critical synchronous path. + +## Response Quality Bar + +- Lead with the capacity model, tail-latency diagnosis, load-test plan, or headroom decision requested. 
+- Cover traffic shape, fanout, tail budgets, saturation signals, load shedding, test results, failure-domain headroom, and cost tradeoffs when relevant before optional performance breadth. +- Make recommendations actionable with thresholds, test scenarios, stop criteria, scaling limits, rollback actions, and regression checks where relevant. +- Name the details to inspect, such as p95/p99 metrics, peak/burst traffic, concurrency, queue age, resource saturation, downstream limits, load-test results, and unit cost; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside capacity, performance, and tail latency. Route data pipelines, dependency resilience, or FinOps only when they materially change the decision. +- Be concise: avoid generic performance advice and prefer compact capacity models, latency budgets, and test matrices. + +## Required Outputs + +Every answer — including narrow regression diagnoses — must state, in this order: + +1. **Target at user boundary**: numeric latency/throughput target, percentile (p95/p99/p99.9), and the measurement boundary (edge, gateway, service ingress). Mark unknown explicitly. +2. **Load-test methodology**: name the method (synthetic load, traffic shadow, prod replay), the scenarios (normal/peak/burst/soak), and pass/stop criteria. +3. **Headroom and saturation (USE)**: required headroom percentage and the saturation indicator(s) tracked (utilization, queue depth, queue age, pool wait, drain rate). +4. **Overload behavior**: load-shedding or admission-control mechanism AND which traffic class is preserved by priority. +5. **Queue/backpressure model** for any asynchronous path: queue-depth metric and the backpressure response. +6. **Hot-path / hot-key analysis**: the suspected hot path or hot key and its mitigation. +7. 
Capacity model (normal/peak/burst/failure-domain), latency budget by hop, regression analysis, tested breakpoint, recovery-after-stress result, and cost/headroom tradeoff when cost is in scope. + +## Checks Before Moving On + +- `tail_metric`: target percentile, window, and journey are stated. +- `traffic_model`: peak, burst, concurrency, fanout, and tenant skew are modeled or marked unknown. +- `saturation_signals`: resource, queue, pool, and downstream saturation metrics are identified. +- `test_result`: load or regression test has scenario, stop criteria, result, and check path. +- `breakpoint_known`: the nonlinear failure point, or the reason it was not tested, is recorded. +- `headroom_check`: capacity includes peak, resource or dependency limits, and expected failure-domain conditions, with static capacity separated from emergency scaling. +- `recovery_after_stress`: recovery time and behavior after stress are measured or explicitly unknown. + +## Red Flags - Stop And Rework + +- Average latency is used as the primary user-experience metric. +- The plan scales replicas but ignores database, cache, queue, or downstream limits. +- Load tests stop at expected peak and never find the nonlinear point. +- Queue depth is monitored without queue age or drain rate. +- Cost cutting removes failover headroom without changing the SLO or accepting risk. +- A single fault-domain or partition recovery plan depends on scaling after the failure rather than preexisting headroom. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating CPU as capacity | Include all saturation points: queues, locks, pools, IO, network, and dependencies. | +| Testing only steady load | Add bursts, soak, failover, cold cache, and dependency-slow scenarios. | +| Hiding overload in queues | Track age and drain rate; shed work before recovery becomes impossible. | +| Optimizing p50 | Optimize the percentile users and SLOs actually experience. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/platform-golden-paths.md b/plugins/sirmarkz/staff-engineer-mode/specialists/platform-golden-paths.md new file mode 100644 index 00000000..1c6f9a58 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/platform-golden-paths.md @@ -0,0 +1,129 @@ +--- +name: platform-golden-paths +description: "Use when developer platforms, golden paths, service templates, scorecards, or paved-road defaults need design" +--- + +# Platform Engineering And Golden Paths + +## Iron Law + +``` +NO GOLDEN PATH WITHOUT RESPONSIBILITY, SECURITY, OBSERVABILITY, DEPLOYMENT, AND OPERATIONS DEFAULTS +``` + +If a template creates a service but not an operable service, it is not a golden path. + +> This skill can be used for a solo repo or a cross-project platform. Golden paths remove repeated setup; for a solo developer the same patterns live as repo templates and scripts, not as a platform product. Use `architecture-decisions` when the work is one service design rather than a reusable path. + +## Overview + +A good platform makes the safe path the easy path. + +**Core principle:** encode standards as reusable workflows, templates, scorecards, and self-service capabilities that projects actually use. + +## When To Use + +- The user asks about internal developer platforms, golden paths, paved roads, service catalogs, templates, scorecards, or standardized service creation. +- Multiple projects need repeatable service setup, deployment, responsibility, telemetry, security, or compliance records. +- The same operational or security gaps recur across services. +- A platform should reduce toil or make standards easier to satisfy. + +## When Not To Use + +- The work is one-off architecture for one service; use `architecture-decisions` instead. +- The request is only infrastructure policy mechanics; use `infrastructure-and-policy-as-code` instead. 
+- The request is compliance program management rather than engineering controls; use `engineering-control-evidence` instead, and only when in scope. +- The work is vendor selection or procurement; out of scope. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Target users, service types, common workflows, pain points, and current failure modes. +- Required defaults: responsibility, SLOs, telemetry, deployment, rollback, runbooks, security, secrets, cost tags, and recovery. +- Generated operational defaults: starter SLO, dashboard, alert/runbook, rollout, recovery, dependency inventory, and secure settings. +- Existing templates, catalogs, scorecards, delivery workflows, infrastructure modules, and exception process. +- Migration needs for existing services and adoption blockers. +- Platform responsibility, operating model, upgrade cadence, and feedback channels. + +## Workflow + +1. **Start from repeated pain.** Choose platform capabilities that remove recurring setup, safety, security, or operations work. +2. **Define the golden path.** Specify the service lifecycle from create through build, test, deploy, observe, operate, secure, and recover to retire. +3. **Bake in defaults.** Include responsibility, SLO hooks, telemetry, safe deploys, secret handling, access boundaries, runbooks, recovery, dependency inventory, and secure settings. Each generated default should be a usable artifact, not a placeholder. +4. **Make start-right templates.** Bootstrap repositories, delivery, infrastructure, observability, security, and policy defaults together so projects do not assemble safety by hand. +5. **Expose self-service with guardrails.** Make the path usable without bespoke platform intervention for normal cases while policy, security, cost, and operations controls stay automatic. +6. 
**Design scorecards.** Measure capability maturity across investment, adoption, controls, provisioning and management, interfaces, and feedback; use source records for meaningful capabilities, not vanity checkboxes. +7. **Handle exceptions.** Require user-confirmed reason, expiry, compensating control, and migration plan. +8. **Plan adoption.** Prioritize new services, high-risk services, and repeated incident classes; avoid big-bang migrations. +9. **Close feedback loops.** Use incidents, developer friction, and scorecard gaps to improve the platform. + +## Synthesized Default + +Build golden paths around capabilities rather than tools: service creation, build, test, release, telemetry, security, responsibility, recovery, controls, and records. Provide self-service with guardrails, start-right templates, and escape hatches, but make exceptions visible and temporary. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Specialized services can deviate when golden-path assumptions do not fit, but the exception must preserve equivalent checks and operations standards. +- Early platform phases may cover a narrow service type first; state non-goals clearly. 
+- Strict scorecards should start as advisory until platform capabilities make compliance achievable. +- Existing services may need incremental migration instead of template replacement. + +## Response Quality Bar + +- Lead with the platform capability map, golden-path design, scorecard, migration plan, or exception workflow requested. +- Cover lifecycle defaults, service responsibility, build/test/release, telemetry, security, recovery, record hooks, self-service, exceptions, adoption, and feedback before optional platform breadth. +- Make recommendations actionable with templates, required defaults, scorecard checks, migration batches, operating model, and exception expiry where relevant. +- Name the details to inspect, such as current service inventory, onboarding steps, platform friction, incident gaps, template outputs, scorecard results, adoption metrics, and user-visible failure records; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside platform engineering and golden paths. Route infrastructure policy, release engineering, or observability only when those specialist gaps block the platform decision. +- Be concise: avoid generic platform-product language and prefer compact capability maps, lifecycle defaults, and adoption tables. + +## Required Outputs + +- Platform capability map. +- Golden-path lifecycle and template requirements. +- Required service defaults and record hooks. +- Readiness-defaults matrix showing which operational defaults are created, usable, and checked by the golden path. +- Service catalog and responsibility model. +- Capability scorecard with meaningful checks, adoption feedback, and exception workflow. +- Migration/adoption plan. +- Feedback and operations model. 
+ +## Checks Before Moving On + +- `template_defaults`: golden path includes responsibility, SLO/telemetry, deploy/rollback, runbook, security, and secrets defaults. +- `readiness_defaults`: generated defaults for SLO, dashboard, alert/runbook, rollout, recovery, dependency inventory, and secure settings are usable, not placeholders. +- `self_service`: normal workflow can be completed without bespoke platform work. +- `exception_model`: exceptions have user-confirmed reason, expiry, compensating control, and migration path. +- `adoption_plan`: target services, migration order, and operating model are stated. +- `feedback_loop`: incidents, friction, and scorecard gaps feed platform backlog. + +## Red Flags - Stop And Rework + +- Template creates code but no runbook, alerts, or rollout path. +- Platform mandates standards projects cannot satisfy with available tools. +- Scorecards reward fields existing instead of capabilities working. +- Exceptions are permanent. +- Golden path is a vendor product wrapper rather than an engineering workflow. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Building a portal first | Start with repeatable workflows and defaults. | +| No escape hatch | Allow exceptions with user-confirmed reason, expiry, and equivalent controls. | +| Platform as ticket queue | Prefer self-service for normal paths. | +| Measuring adoption only | Measure operational and security outcomes too. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/privacy-and-data-lifecycle.md b/plugins/sirmarkz/staff-engineer-mode/specialists/privacy-and-data-lifecycle.md new file mode 100644 index 00000000..84f92d31 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/privacy-and-data-lifecycle.md @@ -0,0 +1,136 @@ +--- +name: privacy-and-data-lifecycle +description: "Use when personal data needs minimization, classification, retention, erasure, export, or privacy-safe telemetry" +--- + +# Privacy Engineering And Data Lifecycle + +## Iron Law + +``` +NO PERSONAL DATA FLOW WITHOUT PURPOSE, CLASSIFICATION, MINIMIZATION, RETENTION, DELETION, AND AUDIT +``` + +If you cannot find and delete or justify every copy, you do not control the data lifecycle. + +## Overview + +Privacy controls fail when personal data is collected, copied, logged, retained, or derived without a lifecycle. + +**Core principle:** collect the least sensitive data that satisfies the purpose, propagate classification through every copy, and make retention, deletion, export, and audit behavior testable. + +## When To Use + +- The user asks about data minimization, retention, deletion, privacy-safe telemetry, sensitive-data lifecycle, anonymization, pseudonymization, or privacy engineering controls. +- A service copies personal or sensitive data into logs, traces, metrics, caches, search indexes, analytics, ML features, exports, backups, or support tools. +- A system needs engineering support for erasure, export, data subject requests, consent/purpose enforcement, or retention schedules. +- A design needs to prevent privacy regressions in release, observability, or data pipelines. + +## When Not To Use + +- The main issue is tenant boundary enforcement or noisy-neighbor isolation; use `tenant-isolation` instead. +- The main issue is authentication, authorization, secrets, or cryptography; use `identity-and-secrets` instead. 
+- The request is broad legal privacy statements, notice drafting, or regulator/auditor liaison; out of scope unless converted to concrete engineering controls. +- The work is only control mapping; use `engineering-control-evidence` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Data inventory: fields, classifications, purpose, source, users, and downstream copies. +- Collection points, transformations, derived data, logs, telemetry, exports, backups, caches, and support views. +- Log, trace, and metric fields that may contain sensitive or stale data, with retention and cleanup path. +- Retention requirements, deletion triggers, legal holds if any, archival behavior, and backup expiration model. +- Data residency, cross-border transfer constraints, third-party processors, and subprocessors that store or receive personal data. +- Consent or purpose constraints that must be enforced by code, configuration, policy, or workflow. +- Access paths, activity logs, break-glass behavior, and privacy incident history. +- Data-subject request intake path, identity/account scope, response SLA, export format, erasure trigger, verification method, and exception handling. +- Validation approach for minimization, redaction, deletion, export correctness, and regression prevention. + +## Workflow + +1. **Inventory the flow.** Map personal data from collection through storage, processing, telemetry, derived data, export, support, backup, and deletion. +2. **Classify fields.** Mark sensitivity, purpose, allowed uses, residency, retention, and whether the field can be tokenized, redacted, aggregated, or omitted. +3. **Minimize collection.** Remove fields that are not needed; prefer derived, aggregated, tokenized, or on-device/local processing when it satisfies the purpose. +4. **Constrain use.** Enforce purpose, consent, and access constraints in code, data jobs, schemas, policy, or workflow checks. +5. 
**Control copies.** Apply privacy rules to logs, traces, metrics labels, crash reports, caches, search indexes, analytics, ML features, support tools, and third-party processors; remove stale telemetry fields and classify sensitive ones. +6. **Engineer deletion and retention.** Define retention classes, delete propagation, deletion markers for asynchronous cleanup, derived-copy repair, backup expiry, audit trail, holds/exclusions, and failure handling. +7. **Define the data-subject-rights workflow.** Specify how access, export, erasure, and portability requests are received, authenticated, scoped to stores and processors, completed within an SLA, verified for completeness, and closed with an audit record. +8. **Assess anonymization labels.** Do not call data anonymized unless reidentification risk has been assessed with an explicit method such as equivalence-class thresholds, diversity checks, noise-based aggregation, motivated-intruder assessment, or equivalent domain assessment; otherwise call it pseudonymized, aggregated, or tokenized. +9. **Verify export and erasure.** Test that subject, tenant, or account-scoped export/deletion finds expected copies, includes required third-party paths, uses a defined output format, and reports known exclusions. +10. **Prevent regressions.** Add schema checks, telemetry redaction tests, data-lineage alerts, and release checks for new sensitive fields. + +## Synthesized Default + +Use privacy-by-design as engineering controls: data inventory, classification, minimization, purpose enforcement, privacy-safe telemetry, retention/deletion automation, data-subject-rights workflow with SLA, export/erasure verification, and audit. Make user/control-plane deletion and retention behavior explicit across primary, derived, and archived copies. Keep legal interpretation outside the skill; make the agreed control enforceable and testable. 
+ + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Legal hold, fraud, security investigation, or financial record retention may override normal deletion; record scope and expiry. +- Some backup media cannot delete individual records immediately; require bounded expiry, restore-time deletion, and documented risk. +- Aggregated or anonymized data can have different retention only when reidentification risk is assessed. +- Low-risk internal telemetry may use lighter controls if it contains no personal or sensitive data by classification. + +## Response Quality Bar + +- Lead with the data-flow finding, privacy control design, retention/deletion plan, data-subject-rights workflow, or blocker list requested. +- Cover inventory, classification, minimization, purpose/access enforcement, telemetry/support controls, retention/deletion propagation, and verification before optional privacy breadth. +- For access, erasure, export, or portability requests, state the request workflow, responsible control points, SLA, store coverage, exception list, verification method, and closure notes. 
+- Make recommendations actionable with field-level decisions, control points, test checks, failure handling, and retention or exception expiry where relevant. +- Name the details to inspect, such as field inventories, data stores, logs, caches, derived copies, consent/purpose rules, deletion traces, export tests, and backup behavior; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside engineering controls for data lifecycle. Leave legal interpretation out unless the user supplies a requirement to implement. +- Be concise: avoid generic privacy principles and prefer compact field inventories, flow maps, and verification plans. + +## Required Outputs + +- Personal-data flow inventory. +- Field classification and minimization plan. +- Purpose/consent/access enforcement plan. +- Privacy-safe telemetry and support-tool controls. +- Telemetry data review table for sensitive or stale log, trace, and metric fields. +- Retention, deletion, backup, and derived-data propagation design. +- Data-subject-rights workflow for access, erasure, export, and portability with intake, scope, SLA, verification, exclusions, and audit closure. +- Anonymization or pseudonymization risk assessment when those labels are used. +- Export/erasure verification plan with store coverage, third-party coverage, output format, exclusion list, and completeness checks. +- Regression checks and activity logs. + +## Checks Before Moving On + +- `data_inventory`: personal and sensitive fields are mapped through primary and derived copies. +- `minimization_check`: every collected field has a purpose and a keep/remove/tokenize decision. +- `copy_control`: logs, metrics, traces, caches, exports, support tools, and analytics have privacy handling. 
+- `telemetry_data_review`: log, trace, and metric fields are reviewed for sensitive data, stale fields, retention, and minimization. +- `deletion_path`: retention, deletion trigger, propagation, backup behavior, and failure handling are defined. +- `dsr_workflow`: access, erasure, export, or portability requests have intake, SLA, scope, verification, exclusions, and closure notes. +- `anonymization_check`: anonymized or pseudonymized outputs state reidentification-risk method and residual limits. +- `verification_plan`: export, erasure, redaction, or minimization controls have tests or review results. + +## Red Flags - Stop And Rework + +- Sensitive fields appear in logs or metric labels because they are useful for debugging. +- Retention is "forever" because no deletion trigger, expiry, or verification path exists. +- Delete requests remove primary rows but leave caches, search indexes, analytics, or ML features. +- Consent or purpose is documented but not enforced by the system. +- Data is labeled anonymized without reidentification risk review. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating privacy as legal text | Convert privacy decisions into code, config, checks, and records. | +| Mapping only primary storage | Include telemetry, derived data, backups, exports, and support tools. | +| Redacting after collection | Minimize or tokenize before broad propagation. | +| Trusting manual deletion | Automate propagation and verify with checks. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/production-readiness-review.md b/plugins/sirmarkz/staff-engineer-mode/specialists/production-readiness-review.md new file mode 100644 index 00000000..b6958cb7 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/production-readiness-review.md @@ -0,0 +1,140 @@ +--- +name: production-readiness-review +description: "Use when launch, migration, tier change, major traffic shift, or release needs go/no-go readiness details" +--- + +# Production Readiness Decision + +## Iron Law + +``` +NO LAUNCH READINESS DECISION WITHOUT CHECKED DETAILS OR A DATED EXCEPTION +``` + +Unknown is not green. Missing details are a blocker, a recorded follow-up with check path and due date, or explicit user risk acceptance. + +## Overview + +Produces a tier-classified launch posture with a readiness matrix, a blocker list, and an exception register with expiry dates. Stops launches that mistake intentions for checked facts. Unknown is not green. + +**Core principle:** before launch or major traffic shift, show — with artifacts, not intentions — that responsibility, reliability, observability, safe change, security, capacity, recovery, and incident paths are good enough for the declared tier. + +## When To Use + +- The user asks whether a service, feature, migration, tier upgrade, major traffic shift, or system is ready for production. +- A launch touches multiple engineering surfaces and needs one readiness posture. +- The user asks for production readiness across responsibility, SLOs, rollout, security, capacity, recovery, and operations. +- You need blockers, exceptions, and follow-up routes before go/no-go. + +## When Not To Use + +- A small code change has no production responsibility, operational, security, or reliability impact. +- The user needs one narrow artifact, such as only an SLO table (use `slo-and-error-budgets` instead) or only a threat model (use `secure-sdlc-and-threat-modeling` instead). 
+- The request is model-serving promotion details, eval thresholds, skew, drift, monitoring, or rollback; use `ml-reliability-and-evaluation` instead. +- A live incident is underway; route to `incident-response-and-postmortems` first. +- The question is business confirmation, marketing launch, legal release decision, or procurement; out of scope. + +## Info To Gather + +- Launch scope, tier, customer/user impact, production dependencies, and user decision point. +- Architecture artifact: component diagram or textual component map, request/data flow, upstream and downstream dependencies, and fault-domain boundaries. +- Operability: who can run the launch, fallback path, diagnostics, incident path, and user decision point. +- SLOs/error budgets, dashboards, alerts, runbooks, and incident communication path. +- Availability posture: location independence, partition survivability, static failover capacity, and recovery drill results. +- Rollout plan, rollback path, canary metrics, migration plan, and feature/config lifecycle. +- Security posture: threat model, data classification, access controls, secrets, supply-chain controls, and vulnerability status. +- Capacity, load-test results, overload behavior, failover target, and dependency quotas. +- Backup/restore, DR results, data migration validation, and destructive-change safeguards. +- Freshness of readiness details: last checked dashboards, alerts, runbooks, rollout checks, recovery checks, load tests, and open drift since the last readiness decision. +- Open risks, exceptions, compensating controls, expiry dates, and follow-up actions. + +## Workflow + +1. **Classify launch tier and scope.** State what is launching, who is affected, and which standard applies. +2. 
**Apply the default tier rubric.** Tier 1 means externally committed, customer-critical, sensitive-data, stateful, or safety-critical impact; Tier 2 means user-visible degradation with bounded blast radius; Tier 3 means internal or shared-service impact; Tier 4 means isolated prototype or experiment. +3. **Collect artifacts.** Gather readiness details from specialist domains instead of rewriting all domain work inside PRR; mark stale details and drift since the last relevant readiness decision. +4. **Check architecture shape.** Identify the component diagram or textual map, production dependencies, and fault-domain map for the launch path; if these are missing for a customer-impacting launch, mark the architecture gap explicitly. +5. **Mark each domain.** Use Pass, Blocker, Exception, Follow-up, or Not Applicable. A gap is a Blocker when it can violate the tier's user, data, security, recovery, or rollback requirement before launch; it is a Follow-up only when launch risk remains bounded and the follow-up action, check path, and due date are explicit. +6. **Check runtime readiness.** Require SLOs, journey health model, telemetry, alerts, runbooks, fallback path, diagnostics, and incident path for customer-impacting launches. +7. **Check change readiness.** Require rollout, rollback, canary, compatibility, migration, and cleanup details. +8. **Check resilience and recovery.** Require location or partition independence, static failover capacity, overload behavior, failover targets, recovery drills, and restore test results when relevant. +9. **Check security and integrity.** Require threat model, access controls, secret handling, build integrity, and unresolved vulnerability posture. +10. **Check cross-pillar tradeoffs.** Identify reliability, security, cost, operational, and performance decisions that improve one quality while weakening another. +11. **Summarize advisory posture.** Produce blockers, exceptions, and follow-up routes. 
The skill identifies objective blockers and readiness gaps; the user decides whether to proceed. + +## Synthesized Default + +Use PRR as a cross-domain readiness decision for launches and major changes. It should inspect available details, identify missing artifacts, expose cross-pillar tradeoffs, and route only the highest-risk gaps. It should not auto-load every specialist skill. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Internal prototypes may use advisory PRR if they cannot affect customers, production data, or shared infrastructure. +- Tier-1, regulated, stateful, or externally committed systems require stricter checks and dated risk acceptance. +- Emergency launches can proceed with documented risk when delaying is worse, but follow-up checks and post-launch checks are mandatory. +- A domain can be Not Applicable only with the disqualifying property, supporting details, and reason, not by omission. + +## Response Quality Bar + +- Lead with the launch posture, blocker list, exception register, or readiness decision boundary requested. 
+- Cover architecture, responsibility, runtime readiness, safe change, recovery, security, and capacity details before optional PRR breadth. +- Include an architecture row for customer-impacting launches: component diagram or textual map, dependencies, and fault-domain map. +- Make recommendations actionable with missing details, checks, due dates, stop criteria, user risk acceptance, and exception expiry where relevant. +- Name the details to inspect, such as dashboards, SLOs, rollout plans, runbooks, load tests, restore checks, threat models, and vulnerability status; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside launch readiness. Route only the highest-risk specialist follow-ups and cap them at two unless the user asks for a full readiness pack. +- Be concise: avoid generic checklist prose and prefer compact readiness matrices, blocker tables, and exception registers. + +## Required Outputs + +- PRR readiness matrix by domain and status. +- Freshness and drift notes for readiness details that can go stale, such as dashboards, runbooks, rollout checks, recovery checks, and load tests. +- Architecture entry with component diagram or textual map, production dependencies, and fault-domain map. +- Availability row covering fault-domain independence, static capacity under loss, recovery mechanism, and drill results. +- Launch blocker list with required details, file/path or artifact reference, and due date. +- Exception register with user risk acceptance, expiry, compensating control, and refresh trigger. +- Advisory launch posture and risk summary. +- Specialist follow-up routes, capped and prioritized. +- Tier classification and advisory boundaries: what the skill can mark as blocker, exception, follow-up, or not applicable versus who decides launch. 
+ +## Checks Before Moving On + +- `tier_check`: tier classification states impact radius, data sensitivity, statefulness, external commitment, and user decision point. +- `architecture_check`: architecture details include component diagram or textual component map, production dependencies, and fault-domain map for the affected launch path. +- `operability_check`: every production component has fallback path, diagnostics, tier, and user decision point. +- `runtime_check`: customer-impacting paths have SLOs, health states, telemetry, alerts, runbooks, and incident path. +- `change_check`: rollout, rollback, canary metrics, compatibility, and cleanup are documented. +- `freshness_check`: readiness details that can drift have a last-checked signal, current source, or explicit follow-up. +- `availability_check`: customer-impacting systems have location/partition independence, static failed-domain capacity, recovery path, and validation results or an explicit exception. +- `recovery_check`: stateful or tier-critical systems have restore/DR results or an explicit exception. +- `exception_check`: every accepted risk has explicit user acceptance, expiry, compensating control, and refresh trigger. + +## Red Flags - Stop And Rework + +- The checklist is green but has no links, commands, artifact references, or explicit user decision point. +- PRR gives go/no-go authority to the agent instead of presenting details for the user decision. +- Exceptions never expire. +- The launch can roll forward but cannot roll back or stop safely. +- "Not applicable" is used to avoid security, recovery, or incident checks without rationale. +- A blocking gap is downgraded to follow-up without stating why the launch risk remains bounded. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating PRR as a mega-skill | Aggregate readiness details and route gaps to specialists. 
| +| Counting intentions as facts | Require artifacts, commands, dashboards, runbooks, or dated exceptions. | +| Making all risks equal | Separate blockers from accepted exceptions and follow-ups. | +| Forgetting responsibility | Every blocker and exception needs supporting details, expiry, and user decision point. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/progressive-delivery.md b/plugins/sirmarkz/staff-engineer-mode/specialists/progressive-delivery.md new file mode 100644 index 00000000..d990a72e --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/progressive-delivery.md @@ -0,0 +1,138 @@ +--- +name: progressive-delivery +description: "Use when config, schema, data, or non-API client changes need staged rollout, canary metrics, or rollback plans" +--- + +# Progressive Delivery And Safe Change + +## Iron Law + +``` +NO PRODUCTION CHANGE WITHOUT A BLAST RADIUS, STOP CRITERIA, AND RECOVERY PATH +``` + +If the rollout cannot be stopped or reversed when rollout signals degrade, it is not safe delivery. + +## Overview + +Produces a staged rollout plan with named blast radius per stage, predeclared canary metrics with baseline and observation windows, stop and rollback criteria, and cleanup responsibility for every temporary flag or compatibility path. Refuses rollouts whose rollback only reverts code while config, schema, data, or clients stay forward. + +**Core principle:** treat code, configuration, flags, schemas, data, infrastructure, and model artifacts as production changes with the same blast-radius discipline. + +## When To Use + +- The user asks how to roll out, rollback, canary, phase, stage, check, migrate, or release a change. +- A change involves configuration, feature flags, schema/data migration, dependency update, model change, infrastructure change, or client-visible behavior. +- The user asks how to reduce production risk from deployments or release trains. 
+- PRR or launch readiness needs rollout, rollback, and canary details. + +## When Not To Use + +- A live incident needs immediate command and mitigation; route to `incident-response-and-postmortems` first. +- The question is only code review or merge checks; use `agent-pr-review` for a concrete diff or `testing-and-quality-gates` for blocking checks. +- The question is build systems, release branches, packaging, or reproducible artifacts; use `release-build-reproducibility` instead. +- The main risk is database lock/backfill execution; use `database-operations` instead for that detail and use this skill for rollout sequencing. +- The request is product launch messaging or marketing; out of scope. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Change type, responsible change path, affected users, blast radius, tier, and reversibility. +- Artifact identity and promotion path from build to environments. +- Production-like preflight stage coverage and whether exposure control is separate from deployment. +- Rollout unit: instance, ring, cohort, partition, deployment unit, location, tenant, percentage, device group, or internal-only group. +- Canary metrics: SLO symptoms, errors, latency, saturation, correctness, business invariants, and guardrail signals. +- Rollback or forward-fix path for code, config, flags, schema, data, and clients. +- Feature-flag lifecycle, config validation, migration steps, cleanup responsibility, and expiry. +- Observability markers, dashboards, alerts, incident path, and communication expectations. + +## Workflow + +1. **Classify the change.** Separate code, config, flag, schema, data, infrastructure, dependency, model, and client components; each can fail differently. +2. **Bound the blast radius.** Pick the smallest rollout unit that still gives signal. 
State who or what can be affected at each stage, and avoid stages that can damage multiple independent locations, partitions, or deployment units at once. +3. **Promote one artifact.** Build once and promote the same artifact or immutable change set through stages. +4. **Define compatibility.** Ensure old and new versions can coexist across clients, services, data, and messages during rollout. +5. **Stage stateful changes.** Keep reader/writer compatibility across at least one-version skew; use expand/contract, dual-read/dual-write, delayed cleanup, and explicit schema/data ordering when state is involved. +6. **Choose canary checks.** Select metrics before release. Include user-visible symptoms and correctness, not only internal health. Scope each metric to the canary slice itself — fleet-aggregate metrics dilute the signal into the size of the unchanged deployment, so canary regression vanishes long before it crosses a fleet-wide threshold. Each check needs a baseline window, minimum observation window, bake time, and enough exposed traffic or an alternate signal such as synthetic probes, extended bake time, or manual verification. +7. **Check each exposure step.** Exercise the changed path in a production-like preflight stage, then start with a tiny production slice when possible and move through rings, cohorts, partitions, stamps, deployment units, or locations only after health signals say the previous step is safe. Within an ordinary rolling deployment, keep at least two-thirds of serving capacity healthy at all times unless an explicit capacity model shows a different threshold is safe; faster simultaneous replacement narrows surge headroom and risks turning the deployment itself into the saturation event. +8. **Set stop and rollback rules.** Define thresholds, who can halt, and how rollback works. Stop signals should fire before the tighter internal SLO alert thresholds are crossed. 
Pre-classify rollback safety per change: it is safe when the change is stateless, flag-gated, purely additive, or recently deployed with minimal state divergence; it is dangerous when a schema migration has run, a data format changed and new data is being written, external clients depend on the new contract, a stateful workflow is in flight, or a cache holds data in the new format. Choose forward-fix when rollback would cause more damage than the current impact, the fix is small and quickly deployable, or impact is confined to an isolatable subset. If user impact is active, route incident command to `incident-response-and-postmortems` while keeping rollback mechanics traceable here. +9. **Handle forward-fix-only surfaces.** If rollback is structurally impossible, require a server-side kill switch or disable path, staged adoption metric, hotfix lane, and explicit user confirmation before first exposure. +10. **Handle non-code changes as first class.** Validate config, stage flags, throttle migrations, and delay destructive cleanup. +11. **Keep emergency flow familiar.** Hotfixes may move faster, but should use the same artifact identity, health checks, and traceable branch/change workflow where practical. +12. **Close the loop.** Record rollout results, remove temporary flags/paths, and update standards if the rollout found a new class of risk. + +## Synthesized Default + +Use build-once promotion, progressive exposure, predeclared health and canary metrics, automated or explicit stop criteria, reversible changes, and cleanup responsibility. Prefer small production slices, bake time, and independent fault-domain waves over parallel broad exposure. Prefer compatibility and expand/contract patterns over big-bang cutovers. Treat deploy, exposure, and customer-visible release as separate control points. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. 
+- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Emergency fixes may use a narrower or faster rollout when waiting is riskier than release, but stop criteria and rollback checks still apply. +- Some destructive data changes cannot be rolled back; they require backup/restore test results, delayed cleanup, and forward-fix criteria. +- Low-risk internal changes may use lighter checks if blast radius and user risk acceptance are explicit. +- Client releases with slow adoption may require forward-fix and kill-switch strategy rather than true rollback. +- Temporary experiment flags should expire within about 90 days by default; long-lived operational kill switches need a renewal cadence and removal or renewal decision. + +## Response Quality Bar + +- Lead with the rollout plan, halt criteria, rollback path, or exposure decision requested. +- Cover blast radius, artifact identity, canary metrics, compatibility, feature/config lifecycle, migration safety, and cleanup before optional delivery topics. +- Make recommendations actionable with stage thresholds, windows, stop criteria, rollback or forward-fix actions, and cleanup expiry where relevant. 
+- Name the details to inspect, such as artifact IDs, deploy markers, canary baselines, SLO/error signals, migration checks, rollback checks, and flag inventory; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside progressive exposure and safe change. Route build reproducibility, API compatibility, or data migration depth only when they materially block rollout safety. +- Be concise: avoid generic CD background and prefer compact rollout, metric, and rollback tables. + +## Required Outputs + +- Rollout plan with stages, blast radius, responsible change path, and schedule. +- Preflight and first-slice checks for the changed path, with the rollout unit and stop signal for each step. +- Canary metric set with thresholds, baseline window, observation window, minimum signal, and expected behavior. +- Stop, rollback, mitigation, and forward-fix criteria. +- Compatibility plan for old/new code, clients, data, messages, config, and stateful reader/writer skew. +- Feature flag/config lifecycle plan with expiry and removal condition. +- Migration and cleanup plan for temporary paths or data structures. +- Verification commands or check links for each check. + +## Checks Before Moving On + +- `blast_radius`: every rollout stage names affected users/systems and maximum impact. +- `artifact_identity`: the release identifies the artifact/change set and promotion path. +- `canary_criteria`: canary metrics, thresholds, windows, and stop rules are defined before rollout. +- `fault_domain_sequence`: customer-impacting exposure moves through bounded instance, cohort, partition, deployment-unit, or location waves rather than parallel broad deployment. +- `preflight_parity`: the changed path is exercised in a production-like preflight stage or the gap is explicit. 
+- `small_slice_first`: production exposure starts with a small slice unless the risk decision explains why not. +- `auto_stop_signal`: stop or rollback signals are tied to user-visible health and stricter internal thresholds. +- `rollback_path`: rollback or forward-fix path is pre-classified per change type, tested, rehearsed, or explicitly exempted with user confirmation. +- `cleanup_responsibility`: temporary flags, configs, compatibility paths, and migration leftovers have cleanup action and expiry. + +## Red Flags - Stop And Rework + +- Rollback means "revert the PR" while config, data, schema, or clients are not reversible. +- Canary metrics are picked after the rollout begins. +- One rollout stage can affect multiple independent fault domains before prior stages bake. +- Feature flags have no removal plan. +- Configuration changes bypass validation or staged rollout. +- Destructive cleanup happens in the same step as first exposure. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating deploy and release as the same thing | Deploy safely first; expose behavior progressively. | +| Only measuring service health | Include user symptoms and correctness invariants. | +| Ignoring config | Validate, stage, and roll back config as carefully as code. | +| Forgetting cleanup | Track temporary flags and compatibility paths to removal. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/release-build-reproducibility.md b/plugins/sirmarkz/staff-engineer-mode/specialists/release-build-reproducibility.md new file mode 100644 index 00000000..a33a55de --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/release-build-reproducibility.md @@ -0,0 +1,129 @@ +--- +name: release-build-reproducibility +description: "Use when cutting releases, release trains, release candidates, builds, packaging, artifact identity, or promotion" +--- + +# Release Engineering And Build Reproducibility + +## Iron Law + +``` +NO RELEASE WITHOUT PINNED INPUTS, REPRODUCIBLE BUILD, IMMUTABLE ARTIFACT, AND TRACEABLE PROMOTION +``` + +If you cannot tell exactly what was built, how it was built, and where it was promoted, the release is not reliable. + +## Overview + +Release engineering turns source changes into trustworthy artifacts. + +**Core principle:** build from pinned inputs in a controlled environment, identify the artifact precisely, and promote that artifact through validation and release. + +## When To Use + +- The user asks about build systems, release engineering, release trains, release branches, release candidates, packaging, versioning, or artifact promotion. +- The user is making a release and needs release-cut steps, branch or candidate handling, versioning, packaging, artifact identity, or promotion checks. +- Builds are slow, flaky, non-hermetic, non-reproducible, cache-sensitive, or dependent on local developer machines. +- A release process needs build-once promotion, release cut criteria, release branch policy, or artifact identity. +- You need to separate build, deploy, and release responsibilities. + +## When Not To Use + +- The main topic is rollout stages, canaries, feature flags, rollback, or production exposure; use `progressive-delivery` instead. 
+- The main topic is artifact signing, provenance maturity, dependency inventory, builder trust, or deploy admission; use `software-supply-chain-security` instead. +- The main topic is generic code review latency or developer workflow policy, with no build or release artifact risk. +- The main topic is an actively vulnerable deployed artifact; use `vulnerability-management` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Source revision, branch/release-line model, release cadence, and supported versions. +- Build graph, test graph, generated code, packaging steps, and artifact outputs. +- Pinned dependencies, lockfiles, toolchains, build images, environment variables, and network access. +- Cache strategy, cache keys, invalidation rules, remote/local differences, and flaky build examples. +- Release checks: tests, static checks, compatibility checks, security checks, and confirmation requirements. +- Artifact identity, metadata, storage, promotion path, deployment consumers, and rollback path. +- Deploy or scale dependencies on live artifact sources, mirrored or cached artifacts, and behavior when artifact sources are unavailable. + +## Workflow + +1. **Separate concerns.** Distinguish developer build feedback, CI validation, artifact creation, deployment, and user-facing release. +2. **Pin every input.** Record source revision, dependencies, toolchains, build image, generators, and configuration needed to recreate the artifact. +3. **Make builds hermetic.** Remove undeclared local files, ambient credentials, network fetches, clock-sensitive output, and machine-specific behavior. +4. **Stabilize the graph.** Define build/test targets, cache keys, generated-output responsibility, and invalidation rules so cache hits cannot hide missing dependencies. +5. 
**Build once, promote many.** Create an immutable artifact once and move the same artifact through validation, staging, and production; deploy and scale paths should use pinned, available artifacts rather than live resolution during an emergency where feasible. +6. **Define release lines.** Choose trunk release, release branch, train, or candidate flow based on support window and rollback needs. +7. **Keep main recoverable.** Prefer short-lived topic branches, protected main, and release branches with explicit cherry-pick/backport policy so hotfixes do not disappear from the next release. +8. **Check releases deliberately.** Keep checks fast and signal-rich; quarantine flaky checks, but do not let flakes silently weaken the release signal. +9. **Record traceability.** Link artifact, source, build logs, checks, release decision, deployment, and rollback target. + +## Synthesized Default + +Use hermetic, reproducible, build-once promotion with pinned inputs, explicit artifact identity, fast automated checks, and traceable release metadata. Prefer trunk-compatible releases with short-lived topic branches; add maintained release branches only when support windows require them. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. 
+- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Emergency fixes may use a shortened check path, but artifact identity, pinned inputs, and rollback target still apply. +- Long-lived support branches are appropriate when customers, platforms, or compliance commitments require maintained versions. +- Some generated artifacts cannot be byte-identical across platforms; require semantic reproducibility and record the allowed nondeterminism. +- Experimental internal tools may use lighter packaging if they do not create production artifacts. + +## Response Quality Bar + +- Lead with the release pipeline decision, reproducibility gap, flaky-build diagnosis, or release-cut plan requested. +- Cover pinned inputs, hermeticity, artifact identity, cache safety, release checks, promotion, and rollback traceability before optional release topics. +- Make recommendations actionable with build metadata, validation commands, checks, stop criteria, and rollback artifact references where relevant. +- Name the details to inspect, such as source revision, lockfiles, toolchain versions, build images, cache keys, build logs, artifact metadata, and promotion records; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside build and release engineering. Route rollout/canary behavior or supply-chain signing only when those are the central unresolved risk. +- Be concise: avoid generic release-process background and prefer compact pipeline maps, hermeticity checklists, and traceability tables. + +## Required Outputs + +- Build and release pipeline map. +- Pinned-input and hermeticity checklist. +- Artifact identity and metadata standard. 
+- Deploy/scale artifact dependency table with source, pinning, availability, mirror/cache, and unavailable-source behavior. +- Release branch/train/candidate policy. +- Build cache and invalidation policy. +- Release check list with required versus advisory checks. +- Promotion and rollback traceability plan. + +## Checks Before Moving On + +- `input_pinning`: source, dependencies, toolchains, generated inputs, and build environment are pinned or explicitly exempted. +- `hermeticity_check`: build does not depend on undeclared local files, ambient network, machine state, or unscoped credentials. +- `artifact_identity`: artifact has immutable identifier, source revision, build metadata, and storage location. +- `artifact_availability`: deploy and scale paths use pinned artifacts with availability behavior defined for missing artifact sources. +- `cache_safety`: cache keys and invalidation rules show stale output cannot satisfy changed inputs. +- `release_record`: promotion and rollback path link artifact, checks, deployment, and verification results. + +## Red Flags - Stop And Rework + +- Release artifacts are rebuilt separately for each environment. +- A build passes only on one developer machine or one CI worker. +- Cache misses are slow, but cache hits are not trusted. +- Release branches exist indefinitely with no support window or merge policy. +- Rollback target is "whatever was previously deployed" with no artifact identity. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating deploy as release | Build and deploy artifacts separately from user exposure. | +| Chasing speed before determinism | Make the build correct and reproducible, then optimize graph and cache. | +| Ignoring generated code | Treat generators and generated outputs as declared build inputs. | +| Letting flakes erode checks | Quarantine, assign, and fix flakes with expiry. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/resilience-experiments.md b/plugins/sirmarkz/staff-engineer-mode/specialists/resilience-experiments.md new file mode 100644 index 00000000..92fe874f --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/resilience-experiments.md @@ -0,0 +1,133 @@ +--- +name: resilience-experiments +description: "Use when chaos tests, game days, failover drills, or fault injection need hypothesis, blast radius, and abort criteria" +--- + +# Resilience Experiments And Chaos Engineering + +## Iron Law + +``` +NO FAILURE EXPERIMENT WITHOUT HYPOTHESIS, BLAST RADIUS, ABORT CRITERIA, TELEMETRY, AND LEARNING LOOP +``` + +Breaking things without a learning objective is not engineering. + +## Overview + +Resilience experiments test whether the system behaves the way the design says it should behave. + +**Core principle:** run controlled experiments with a hypothesis, bounded blast radius, observable steady state, abort criteria, and follow-up fixes. + +## When To Use + +- The user asks for a chaos experiment, game day, failover drill, disaster role play, fault injection, or resilience test plan. +- You want test results that confirm retry, failover, overload, backup, or dependency-failure behavior works. +- You need to exercise location, partition, deployment-unit, traffic-shift, startup, or recovery behavior before relying on it. +- A launch readiness decision needs controlled failure validation. +- Incident follow-up requires proving that a class of failure is now handled. + +## When Not To Use + +- The main deliverable is fault-domain topology, static stability, or multi-location design; use `high-availability-design` instead. +- The request is proving failover capacity, topology, or availability assumptions rather than designing the experiment itself; use `high-availability-design` instead. 
+- The main deliverable is backup restore testing or RTO/RPO validation; use `backup-and-recovery` instead unless broader experiments are central. +- The main deliverable is timeout, retry, queue, or overload policy; use `dependency-resilience` instead. +- The work is only unit/integration testing without runtime failure injection. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- System tier, SLOs, critical journeys, known failure modes, and previous incident classes. +- Existing fault-domain map, dependency matrix, capacity model, and recovery runbooks. +- Previous tests for dependency unavailability, dependency slowness, cache loss, fault-domain loss, and alert/runbook response. +- Steady-state signals: availability, latency, correctness, freshness, saturation, queue age, and user-impact indicators. +- Experiment target, injected fault, blast radius, duration, traffic scope, customer exposure, and abort criteria. +- Production cadence or trigger for recurring drills, based on tier and change rate. +- Participants, on-call coverage, communication channel, user decision point, and rollback/fallback actions. +- What to record, expected outcome, safety constraints, and follow-up tracking path. + +## Workflow + +1. **State the hypothesis.** Use the form: "If X fails, the system will continue Y within Z because controls A and B work." +2. **Define steady state.** Pick user-visible and causal signals before injecting failure. +3. **Bound the blast radius.** Start with shift-left simulation or staging when needed, then a small partition, tenant, shard, deployment unit, or traffic slice. +4. **Set abort criteria.** Decide in advance which SLO burn, error, latency, saturation, data, or operator signal stops the experiment. +5. **Prepare responders.** Confirm on-call, runbooks, rollback, communication channel, and user decision point. +6. 
**Inject one failure.** Change one variable at a time unless the explicit goal is compound-failure validation; cover the highest-risk missing modes across dependency down, dependency slow, cache loss, fault-domain loss, and response-path failure. +7. **Observe and decide.** Compare actual behavior to hypothesis, abort on criteria, and record results while observations are still fresh. +8. **Set recurrence deliberately.** For tier-critical recovery mechanisms, define when to repeat the drill after topology, traffic, dependency, or runbook changes. +9. **Close the loop.** File fixes, update runbooks, add regression checks, and rerun only after material changes. + +## Synthesized Default + +Use hypothesis-driven experiments that begin small, verify user-visible steady state, and expand only after results support the previous scope. Treat shift-left experiments, shift-right production drills, and game days as engineering validation, not ritual. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Tabletop or disaster role play is appropriate before risky operational drills, but it does not replace technical validation. 
+- Production experiments may be inappropriate for safety-critical, destructive, or unbounded failure modes; use simulation or isolated environments. +- Compound failures are valid only after single-failure behavior is understood and observability is strong. +- Low-tier internal services may use lightweight drills when the user explicitly accepts the risk. + +## Response Quality Bar + +- Lead with the experiment hypothesis, blast-radius boundary, abort criteria, or results plan requested. +- Cover steady state, fault method, scope, telemetry, participant/communication plan, rollback actions, result capture, and learning loop before optional chaos-program breadth. +- Make recommendations actionable with exact fault injection, thresholds, stop trigger, rollback commands, details to capture, and rerun criteria where relevant. +- Name the details to inspect, such as dashboards, SLO signals, deployment markers, runbooks, dependency health, experiment logs, findings, and fix paths; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside resilience experiment design and execution. Route HA redesign or DR strategy only when the experiment exposes those as central gaps. +- Be concise: avoid generic chaos-engineering background and prefer compact experiment plans and findings tables. + +## Required Outputs + +- Experiment hypothesis. +- Experiment portfolio showing failure mode, expected user behavior, stop condition, last run, next trigger, and follow-up. +- Steady-state signal list and dashboard links. +- Fault injection method and blast-radius boundary. +- Abort criteria and rollback/fallback actions. +- Recurrence trigger, deadline, or cadence for critical recovery behavior. +- Participant, on-call, and communication plan. +- Result capture checklist. 
+- Findings, fixes, and rerun condition. + +## Checks Before Moving On + +- `hypothesis_check`: experiment states failure, expected behavior, and resilience mechanism. +- `blast_radius`: affected users, partitions, tenants, shards, locations, or traffic percentage are bounded. +- `abort_criteria`: stop thresholds and user decision point are defined before the experiment. +- `telemetry_check`: steady-state and causal signals are visible during the test. +- `learning_loop`: findings create maintained fixes or explicit risk acceptance. +- `recurrence_rule`: critical recovery behavior has a repeat trigger, deadline, or cadence tied to tier, topology, traffic, dependency, or incident learning. +- `fault_mode_coverage`: the experiment set covers the highest-risk failure modes or lists the skipped modes and reason. + +## Red Flags - Stop And Rework + +- The plan says "run chaos" but names no hypothesis. +- The failure can affect all customers before anyone can abort. +- Only infrastructure health is monitored while user impact is unknown. +- The drill depends on manual heroics that are not documented or repeatable. +- Findings are recorded but no fix path or rerun condition exists. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating chaos as random failure | Inject a specific fault to test a specific expected behavior. | +| Starting too large | Check behavior in the smallest useful blast radius first. | +| Ignoring correctness | Include data correctness, freshness, and side effects, not just uptime. | +| Ending at the debrief | Convert findings into fixes, tests, and runbook updates. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/secure-sdlc-and-threat-modeling.md b/plugins/sirmarkz/staff-engineer-mode/specialists/secure-sdlc-and-threat-modeling.md new file mode 100644 index 00000000..6fc323f1 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/secure-sdlc-and-threat-modeling.md @@ -0,0 +1,123 @@ +--- +name: secure-sdlc-and-threat-modeling +description: "Use when features need threat models, trust boundaries, abuse cases, control tests, or residual-risk records" +--- + +# Secure SDLC And Threat Modeling + +## Iron Law + +``` +NO SECURE DESIGN DECISION WITHOUT TRUST BOUNDARIES, DATA FLOWS, THREATS, CONTROLS, AND TESTS +``` + +If threats do not map to controls and verification, the decision is not actionable. + +## Overview + +Produces a trust-boundary and data-flow map, an abuse-case table, a control mapping with verification for each high-risk control, and a residual-risk register with explicit user acceptance and expiry. Refuses to accept controls that cannot be tested, gated, or observed. + +**Core principle:** model trust boundaries and abuse cases early, then turn threats into testable controls, explicit checks, and user-accepted residual risk. + +## When To Use + +- The user asks for threat modeling, secure design, abuse cases, secure SDLC, input validation, authorization decisions, or application security requirements. +- A change crosses trust boundaries, handles sensitive data, exposes an interface, adds privileged operations, or changes operational access. +- A design needs security acceptance criteria before implementation or launch. +- The user asks what attackers can abuse or what controls must exist. + +## When Not To Use + +- The main topic is build provenance, artifact signing, dependency inventory, or deployment admission; use `software-supply-chain-security` instead. 
+- The main topic is identity, secrets, cryptography lifecycle, or access lifecycle; use `identity-and-secrets` or `cryptography-and-key-lifecycle` instead. +- The main topic is LLM prompt, tool, or retrieval abuse; use `llm-application-security` instead. +- The request is broad legal/compliance program management; out of scope unless reframed as engineering controls. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Actors, identities, roles, trust boundaries, data flows, assets, and deployment surfaces. +- Data classification, sensitive fields, privacy constraints, logging/telemetry handling, and retention. +- Entry points, APIs, background jobs, admin paths, operational access, and third-party integrations. +- Abuse cases, attacker goals, known vulnerability classes, dependency assumptions, and misuse paths. +- Existing controls, tests, self-checks, scanning results, incidents, and residual risks. + +## Workflow + +1. **Map the system.** Identify actors, assets, trust boundaries, data flows, privileged paths, and externally reachable surfaces. +2. **Classify data and operations.** Mark sensitive data, destructive operations, admin actions, and integrity-critical decisions. +3. **List abuse cases.** Write what an attacker or malicious/buggy client tries to accomplish, not only what component might fail. +4. **Apply a threat frame.** Use spoofing, tampering, repudiation, disclosure, denial, privilege elevation, or equivalent categories to avoid blind spots. +5. **Map controls.** Assign authentication, authorization, validation, output handling, rate limits, audit, secrets handling, encryption, and isolation controls. +6. **Make controls testable.** Define unit/integration/security tests, self-checks, runtime monitors, or operational checks for each high-risk control. +7. **Record residual risk.** State compensating control, expiry, acceptance condition, and explicit user risk acceptance. +8. 
**Route specialized surfaces.** Identity/secrets, supply chain, LLM, tenant isolation, and vulnerability remediation go to their specialist skills when central. + +## Synthesized Default + +Use lightweight threat modeling tied to secure SDLC checks: trust-boundary map, abuse cases, control mapping, test plan, and residual-risk register. Prefer controls that are enforced in code, configuration, self-checks, runtime checks, or deployment checks over prose-only rules. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Low-risk internal changes can use a small abuse-case checklist if no trust boundary, data sensitivity, or privileged operation changes. +- High-risk financial, privacy, safety, or admin paths need deeper checks and explicit user risk acceptance. +- Emergency fixes may document the minimal threat decision first and complete residual-risk mapping immediately after mitigation. +- Legal/compliance requirements can constrain controls, but this skill remains focused on engineering implementation and records. + +## Response Quality Bar + +- Lead with the threat-model decision, abuse-case table, control gap, or residual-risk register requested. 
+- Cover trust boundaries, actors, data flows, privileged paths, abuse cases, control mapping, verification, and residual responsibility before optional security breadth. +- Make recommendations actionable with control points, tests or self-checks, stop criteria, compensating controls, and expiry where relevant. +- Name the details to inspect, such as architecture/data-flow diagrams, auth paths, sensitive data stores, logs, deployment checks, security tests, and runtime checks; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside secure design and threat modeling. Use identity, supply-chain, tenant, LLM, or vulnerability skills only when the prompt makes that specialist surface central. +- Be concise: avoid generic vulnerability category lists and prefer system-specific abuse-case and control tables. + +## Required Outputs + +- Trust-boundary and data-flow map. +- Threat and abuse-case table. +- Security requirements and control mapping. +- Verification plan for controls. +- Residual-risk register with explicit user acceptance and expiry. +- Sensitive-data and logging decision. +- Follow-up checks for identity, supply-chain, tenant, LLM, or vulnerability work. + +## Checks Before Moving On + +- `boundary_check`: actors, trust boundaries, data flows, and privileged paths are explicit. +- `threat_coverage`: high-risk abuse cases map to controls. +- `verification_check`: every high-risk control has a test, self-check, runtime check, or source to inspect. +- `data_handling`: sensitive data storage, transmission, logging, and retention behavior is addressed. +- `risk_responsibility`: residual risks have explicit user acceptance, expiry, and compensating control. 
+ +## Red Flags - Stop And Rework + +- The threat model lists generic vulnerability categories without system-specific abuse cases. +- Controls are stated but not testable. +- Admin or operational access is ignored. +- Sensitive data appears in logs, traces, errors, or analytics without controls. +- Residual risks have no user acceptance or expiry. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Starting from checklists | Start from trust boundaries and abuse cases. | +| Treating security as a final checkpoint | Add controls to requirements, code, tests, release, and operations. | +| Focusing only on external attackers | Include insider, compromised credential, confused deputy, and abusive tenant paths. | +| Leaving controls as prose | Tie controls to tests, checks, or source records. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/slo-and-error-budgets.md b/plugins/sirmarkz/staff-engineer-mode/specialists/slo-and-error-budgets.md new file mode 100644 index 00000000..fd9a8082 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/slo-and-error-budgets.md @@ -0,0 +1,136 @@ +--- +name: slo-and-error-budgets +description: "Use when user journeys need SLIs, SLOs, error budgets, burn-rate alerts, urgent-vs-follow-up alerts, or budget rules" +--- + +# SLO Error Budget Engineering + +## Iron Law + +``` +NO SLO WITHOUT A USER JOURNEY, ERROR-BUDGET MATH, AND BUDGET RESPONSE RULES +``` + +If the journey, window, target, budget, and budget response are missing, do not call the SLO complete. The response says when to send an urgent alert, when to create a follow-up, when to slow releases, who can override, and what reliability work comes next. + +## Overview + +Produces an SLI/SLO table tied to named user journeys, an error-budget calculation, multi-window burn-rate alert rules, and budget-state release rules. Refuses 100-percent targets, host-health proxies, and urgent alerts that do not name a user. 
+ +**Core principle:** define the experience users are promised, measure it with SLIs, set an SLO that leaves an explicit error budget, and let that budget govern alert urgency and release risk. + +## When To Use + +- The user asks what reliability target, availability target, latency target, freshness target, correctness target, or durability target a service should meet. +- The user asks which alerts need urgent response, how burn-rate alerts should work, or how to connect alerts to SLOs. +- A launch, PRR, tier upgrade, or reliability decision needs SLI/SLO details. +- Existing alerts are noisy because they monitor causes instead of user-visible symptoms. + +## When Not To Use + +- The user only asks to build dashboards, traces, or logging without a user-visible objective; use `observability-and-alerting` instead. +- The user asks to reduce existing urgent-alert volume or on-call fatigue; use `oncall-health` instead unless new SLO policy is the main work. +- The user asks for cost optimization without reliability targets; use `cost-aware-reliability` instead. +- A live outage is underway; route to `incident-response-and-postmortems` first. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Critical user journeys, API operations, tenants, customer tiers, and response paths. +- Candidate SLIs for availability, latency, freshness, correctness, durability, and data loss. +- Current metrics, logs, traces, dashboards, alerts, and incident history. +- Missing-metric behavior, low-traffic detection strategy, and any synthetic or heartbeat signal needed to see user impact when organic traffic is sparse. +- Traffic shape: request volume, batch cadence, peak/seasonal behavior, and dependency fanout. +- External commitments or contractual SLAs, support tier, business-critical periods, and known customer commitments. 
+- Release process: canary checks, freeze rules, rollback authority, and reliability-work intake. + +## Workflow + +1. **Name the user journey.** Write the journey in user terms: "checkout succeeds", "message is delivered", "dataset is fresh", not "instances are healthy". +2. **Choose the SLI.** Prefer direct measures of good events over proxy infrastructure health. If direct measurement is missing, mark telemetry work as a blocker or explicit proxy risk; missing samples are unknown or bad, never green by default. +3. **Define good and bad events.** Specify numerator, denominator, exclusion rules, sampling source, and data-retention limits. +4. **Model health states.** Define healthy, degraded, unavailable, and recovering for the journey so partial failures and degraded quality do not disappear inside raw uptime. +5. **Set the SLO target and window.** Pick a target users need and the system can plausibly meet. Keep internal thresholds tighter than external customer commitments when they exist. Include availability, latency, freshness, recovery, or correctness targets only when they match the journey. Avoid 100 percent unless failure is impossible by construction. +6. **Calculate the budget.** Convert target and window into allowed bad events or bad minutes. Include low-traffic math so one event does not create nonsensical burn. +7. **Design alerts from burn.** Send urgent alerts on fast and sustained budget burn. Create follow-ups for slow burns. As a starting point, alert urgently only when short-window and longer-window burn both show urgent exhaustion risk, create follow-ups when multi-hour or multi-day burn threatens the window, and recompute thresholds for low traffic; use synthetic or heartbeat signals when real traffic cannot detect failure quickly enough. +8. **Handle latency correctly.** State where latency is measured and how percentiles are aggregated. 
Do not average percentiles across services or windows; merge compatible distributions or measure at the user-journey boundary. +9. **Define budget responses.** State what happens when budget is healthy, threatened, exhausted, or repeatedly exhausted: urgent alert, follow-up, slow release, override, or prioritize reliability work. +10. **Route gaps.** Missing telemetry goes to observability; staged rollout rules go to progressive delivery; launch aggregation goes to PRR. + +## Synthesized Default + +Use the standard SRE sequence as the default: user journey -> health model -> SLI -> SLO -> error budget -> multi-window burn-rate alert -> release and reliability rules. Treat reliability targets as design inputs and error budgets as a guardrail on change velocity rather than as a reason to stop delivery permanently. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Internal tools may use advisory SLOs, follow-up alerts, and longer windows when urgent alerts would not protect users. +- External SLAs may force stricter internal SLOs; state both and keep the engineering SLO tighter than the contractual breach point. 
+- Low-volume services may need event-count thresholds or synthetic checks so one failed request does not create misleading burn. +- Diagnostic cause alerts can trigger urgent response only when they are urgent, actionable, and reliably precede user-visible impact. +- Planned maintenance, deliberate shedding, or abusive traffic may be excluded only with enumerable, time-bounded, and auditable rules. + +## Response Quality Bar + +- Lead with the SLO table, alert rules, budget-state decision, or telemetry blocker requested. +- Cover user journeys, SLIs, health states, target/window math, burn alerts, dashboards, release rules, and observability gaps before optional SRE breadth. +- Make recommendations actionable with metric definitions, thresholds, windows, alert routes, budget consequences, and follow-up checks where relevant. +- Name the details to inspect, such as request/event sources, numerator/denominator definitions, traffic volume, deployment markers, current burn, urgent-alert history, and dashboard links; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside SLO and error-budget engineering. Route rollout policy, observability instrumentation, or PRR only when they are the central unresolved risk. +- Be concise: avoid generic SRE exposition and prefer compact SLI/SLO and burn-policy tables. + +## Required Outputs + +- Critical journey inventory with tier and response path. +- SLI/SLO table with target, window, source metric, numerator, denominator, and exclusions. +- Health-state definitions for healthy, degraded, unavailable, and recovering conditions where partial degradation matters. +- Error-budget calculation in bad events or bad minutes. 
+- Burn-rate alert rules with urgent-alert and follow-up thresholds, including windows, budget-consumption rate, low-traffic handling, and diagnostic non-urgent rules. +- Missing-data policy for each SLI, plus synthetic or heartbeat detection for low-traffic journeys where needed. +- Dashboard requirements that show SLO state, burn, traffic, fault-domain scope where relevant, and recent deployments. +- Budget-state release rules and reliability-work triggers. +- Assumptions, proxy risks, blockers, and follow-up routes. + +## Checks Before Moving On + +- `journey_coverage`: every tier-1 or explicitly requested journey has an SLI and a user-visible success definition. +- `health_state`: the SLO can distinguish successful, degraded, unavailable, and excluded events where users experience partial failure. +- `math_check`: every SLO has a target, window, denominator, allowed bad events or minutes, and low-traffic handling. +- `promise_margin`: internal alert or stop thresholds are stricter than external commitments where such commitments exist. +- `missing_data_policy`: missing SLI samples have an explicit health meaning and response. +- `low_traffic_detection`: low-volume journeys use event-count math, synthetic checks, heartbeat signals, or a documented proxy risk. +- `alert_mapping`: every urgent alert maps to SLO burn or has a documented urgent/actionable exception. +- `budget_response`: exhausted-budget behavior is stated, including who can allow releases and what work is prioritized. +- `telemetry_check`: every SLI names its metric/log/event source or marks observability work as a blocker. + +## Red Flags - Stop And Rework + +- The SLI measures CPU, memory, instance health, queue depth, or host availability without connecting to user success. +- The SLO target is 100 percent because "this must never fail". +- Burn-rate thresholds are copied from another service without traffic-window math. +- The response recommends urgent alerts for every cause alert.
+- The budget response says "improve reliability" without release, review, user-decision, or work-intake consequences. +- Latency SLOs average percentile values instead of measuring the journey or merging compatible distributions. +- Journey SLOs are synthesized from component SLOs without an explicit dependency model. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Starting with dashboards | Start with journeys, then metrics. | +| Treating SLA and SLO as synonyms | SLA is external promise; SLO is engineering target; SLI is measurement. | +| Making all alerts urgent alerts | Alert urgently on budget burn; create follow-ups for slow or diagnostic signals. | +| Hiding missing telemetry | Mark telemetry as a blocker or proxy risk. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/software-supply-chain-security.md b/plugins/sirmarkz/staff-engineer-mode/specialists/software-supply-chain-security.md new file mode 100644 index 00000000..d2b84c55 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/software-supply-chain-security.md @@ -0,0 +1,128 @@ +--- +name: software-supply-chain-security +description: "Use when source-to-deploy paths need protected source, isolated builds, provenance, signing, inventory, or admission" +--- + +# Software Supply Chain Security + +## Iron Law + +``` +NO PRODUCTION ARTIFACT WITHOUT SOURCE, BUILD, PROVENANCE, INTEGRITY, AND ADMISSION CHECKS +``` + +If an artifact cannot be traced back to accepted source and a trusted build path, it should not be trusted for production. + +## Overview + +Production should run artifacts whose source, build, dependencies, and confirmation path can be verified. + +**Core principle:** protect the source-to-deploy chain with traceable changes, isolated builds, provenance, artifact integrity, least-privilege automation, and deployment verification. 
+ +## When To Use + +- The user asks about build/deploy security, builder isolation, artifact signing, provenance, dependency inventories, deployment admission, secret scanning, or build/deploy integrity. +- A production path lacks a clear record of what source and build produced an artifact. +- Automation credentials can modify source, build, registry, deployment, or infrastructure. +- You need supply-chain controls or records for release integrity. + +## When Not To Use + +- The work is routine package updates or dead-code cleanup; use `dependency-and-code-hygiene` instead. +- The issue is a deployed vulnerability with patch SLA; use `vulnerability-management` instead. +- The question is runtime authorization or service access; use `identity-and-secrets` instead. +- The request is broad compliance program management; out of scope unless framed as engineering records. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Repositories, branches, change acceptance rules, merge rights, and source protection. +- Build system, workers, isolation, inputs, dependencies, environment, and reproducibility needs. +- Artifact types, registries, signing, checksums, provenance, dependency inventories, and retention. +- Deployment path, admission controls, environment promotion, and rollback. +- Automation credentials, token scopes, secret exposure, and third-party integrations. +- Scanning coverage, vulnerability checkpoint, and incident/exception process. + +## Workflow + +1. **Map source to deploy.** Draw every step from code change through build, artifact, registry, deployment, and runtime admission. +2. **Protect source.** Require traceable accepted changes, branch protections, responsibility, and tamper-evident history for production paths. +3. **Harden builders.** Use isolated or ephemeral build environments for production artifacts; minimize mutable state and privileged credentials. +4. 
**Record provenance.** Produce metadata linking artifact identity, source revision, accepted change, build steps, builder identity, dependency inputs, build time, and confirmation path. Tier-critical paths should make this metadata verifiable at deployment. +5. **Protect artifacts.** Sign or otherwise verify integrity; store artifacts in controlled registries with retention and rollback. +6. **Generate inventories.** Produce structured, machine-readable dependency inventories when they support vulnerability response, customer requests, or release-check workflows; name the consumer so the artifact is not ritual. +7. **Decide reproducibility level.** State whether the path needs byte-identical, declared-nondeterminism, or content-equivalent rebuild records, and record any expected differences. +8. **Standardize secure pipelines.** Use reusable pipeline modules for production paths so scanning, integrity checks, dependency inventories, user confirmations, and secure compute are not optional per repository. +9. **Control deployment.** Verify artifact integrity/provenance at admission and keep environment promotion traceable. +10. **Constrain automation.** Use least-privilege, short-lived credentials and secret scanning across source/build paths. +11. **Screen common attack classes.** Check for dependency confusion, typo or name-squatting, compromised package publishing, build-cache poisoning, unchecked install hooks, and compromised automation credentials. + +## Synthesized Default + +Use accepted source, controlled production pipelines, isolated builds, provenance, signed or integrity-verified artifacts, dependency inventory, least-privilege automation, secret scanning, and deployment admission checks for production paths. Keep routine dependency hygiene and deployed vulnerability remediation as adjacent but separate workflows. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists.
+- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Low-risk prototypes may use lighter controls if isolated from production data and deployment. +- Legacy build systems may need staged improvements; record missing provenance/signing as exceptions with expiry and compensating controls. +- Dependency inventories are useful when consumed for vulnerability, customer, or release-check workflows; do not generate unused artifacts as ritual. +- Emergency patches can use expedited paths only with post-facto provenance and acceptance checks. +- Release engineering covers reproducible build mechanics; this skill covers the trust boundary, provenance expectations, artifact integrity, and admission policy. + +## Response Quality Bar + +- Lead with the source-to-deploy risk, control gap, provenance plan, or exception register requested. +- Cover source acceptance, builder trust, artifact integrity, provenance, dependency inventory, deployment admission, automation credentials, and secret scanning before optional supply-chain breadth. +- Make recommendations actionable with control locations, validation commands, admission checks, exception expiry, and remediation steps where relevant.
+- Name the details to inspect, such as protected branch settings, build identity, isolation model, artifact metadata, signatures or digests, dependency-inventory consumers, deploy policy, and credential scopes; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside supply-chain integrity. Route routine dependency hygiene or deployed vulnerability remediation only when those are the central unresolved risk. +- Be concise: avoid generic framework background and prefer compact control matrices and record maps. + +## Required Outputs + +- Source-to-deploy supply-chain map. +- Control matrix for source, build, artifact, registry, deployment, and automation. +- Provenance and artifact integrity plan with minimum fields: artifact identity, source revision, accepted change, builder identity, dependency inputs, build time, confirmation path, and verification location. +- Structured dependency inventory policy with producer, consumer, retention, and vulnerability checkpoint. +- Build and deployment credential hardening plan. +- Secret scanning and exposure response plan. +- Exceptions with expiry and compensating controls. + +## Checks Before Moving On + +- `source_acceptance`: production source changes require accepted source and protected merge path. +- `builder_trust`: build environment identity, isolation, and credential scope are documented. +- `provenance_check`: production artifacts have source/build provenance or a tracked exception. +- `integrity_check`: deployment path verifies artifact integrity before promotion/admission. +- `credential_check`: automation credentials are least privilege, short lived where possible, and secret-scanned. + +## Red Flags - Stop And Rework + +- Anyone with build access can deploy unaccepted code.
+- Production artifacts are rebuilt differently per environment without traceability. +- Long-lived automation tokens can modify source, artifacts, and deployment. +- Dependency inventories are generated but never used for vulnerability response or release records. +- Artifact signing exists but deployment never verifies it. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Scanning as the only control | Add provenance, integrity, least privilege, and admission. | +| Trusting the registry blindly | Verify artifact identity and provenance at deployment. | +| Mixing routine updates with supply-chain trust | Route routine dependency hygiene separately. | +| Ignoring build credentials | Treat automation credentials as production access. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/state-machine-correctness.md b/plugins/sirmarkz/staff-engineer-mode/specialists/state-machine-correctness.md new file mode 100644 index 00000000..191d2306 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/state-machine-correctness.md @@ -0,0 +1,126 @@ +--- +name: state-machine-correctness +description: "Use when designing or building state machines, protocols, workflows, or concurrency logic needing invariants" +--- + +# Systems Correctness And Formal Validation + +## Iron Law + +``` +NO HIGH-STAKES STATE MACHINE WITHOUT MUST-NEVER RULES, MUST-EVENTUALLY RULES, AND COUNTEREXAMPLE CHECKS +``` + +If you cannot state what must never happen and what must eventually happen, you cannot show that the design handles its critical states. + +## Overview + +Some bugs are too subtle for example-based tests and too expensive to discover in production. + +**Core principle:** express critical correctness properties as invariants, then validate them with the strongest practical combination of model checking, property tests, simulation, fuzzing, runtime checks, and review. 
+ +## When To Use + +- The user is designing, building, or changing a state machine, protocol, workflow, or concurrency boundary and needs correctness rules before relying on examples. +- The user asks about property-based testing or fuzzing of behavior that crosses a state machine, protocol, concurrency boundary, or trust boundary, where examples cannot cover the input or interleaving space. +- A design includes distributed locks, leader election, consensus, replication, retries with mutation, workflows, money movement, authorization state, or irreversible actions. +- A bug would cause data loss, double execution, cross-tenant access, financial inconsistency, or security boundary failure. +- Tests pass for examples, but concurrency, ordering, timing, crash, or retry interleavings remain uncertain. + +## When Not To Use + +- The request is normal unit, integration, end-to-end, or CI merge-check design with no state-machine or invariant under test; use `testing-and-quality-gates`. +- The fuzz target is purely a parser, format decoder, or input validator with no protocol or state-machine surface; use `testing-and-quality-gates`. +- The main question is storage choice, database-backed workflow correctness, or consistency semantics; use `distributed-data-and-consistency` unless high-assurance validation of the storage protocol itself is central. +- The main question is retry, timeout, circuit-breaker, or backoff policy rather than correctness of the underlying state machine; use `dependency-resilience`. +- The system is low-risk and ordinary example-based testing is proportional. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- State machine, actors, operations, messages, retries, timers, crashes, recovery, and concurrency points. +- Safety properties: what must never happen. +- Liveness properties: what must eventually happen, and under which assumptions. 
+- Consistency, idempotency, ordering, durability, authorization, and isolation invariants. +- Existing tests, fuzzers, simulations, model specs, incident examples, and known counterexamples. +- Mapping from model behavior to implementation code, logs, metrics, and runtime monitors. + +## Workflow + +1. **Name the critical property.** Write invariants in plain language before choosing tools. +2. **Bound the model.** Include only state, actors, timing, failures, and nondeterminism needed to test the property. +3. **Choose validation strength.** Match the technique to the invariant. Use property-based testing and fuzzing when the invariant is local and the input or interleaving space exceeds what examples cover; use deterministic simulation when timing, scheduling, crash, or retry interleavings dominate; use model checking when the protocol or concurrency interleaving is the source of risk; reserve formal verification for cryptographic, consensus, or safety-critical mechanisms. Move up the validation ladder when the lower technique cannot cover the state space; do not stop at examples for high-stakes invariants. Bounded property tests on pure logic and parser-only fuzzing with no state-machine or invariant under test belong in `testing-and-quality-gates`. +4. **Search for counterexamples.** Treat each failing trace as design feedback, not as a tool nuisance. +5. **Connect model to code.** Record which code paths implement each transition and which tests or monitors check the mapping. +6. **Verify recovery paths.** Include crash, retry, duplicate, reorder, timeout, partial write, and restart behavior. +7. **Add runtime checks.** Monitor invariants that can be checked in production without leaking sensitive data or harming users. +8. **Re-run on design changes.** Update specs, properties, and generated cases when the protocol or state machine changes. + +## Synthesized Default + +Use lightweight formal or semi-formal validation for high-stakes stateful behavior. 
Start with plain-language invariants and counterexample search, then select tools proportional to risk. Do not require formal verification everywhere; require explicit properties where ordinary tests cannot cover the state space. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Full formal verification may be justified for cryptographic, consensus, safety-critical, or cross-tenant isolation mechanisms. +- Property-based tests may be enough when the state space is implementation-local and failure impact is bounded. +- Deterministic simulation is preferable when implementation timing, scheduling, or crash recovery is the main uncertainty. +- Runtime invariant checks may be sampled or delayed when full checking would harm privacy, cost, or latency. + +## Response Quality Bar + +- Lead with the invariant set, model boundary, counterexample, validation method, or blocker requested. +- Cover safety/liveness properties, state actors, messages, timing, failures, retries, recovery, code mapping, and runtime checks before optional formal-methods breadth. 
+- Make recommendations actionable with model scope, properties to test, counterexample handling, recovery cases, checks, and stop criteria where relevant. +- Name the details to inspect, such as protocol states, transition rules, failure assumptions, trace logs, property-test results, simulation output, model checker traces, and code links; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside correctness validation. Route distributed-data consistency, tenant isolation, or cryptography only when those are the central unresolved risk. +- Be concise: avoid generic formal-methods advocacy and prefer compact property lists, model boundaries, and counterexample tables. + +## Required Outputs + +- Correctness property list with safety and liveness split. +- State-machine or protocol model boundary. +- Validation method selection and rationale. +- Counterexample log and design changes. +- Code-to-model mapping. +- Recovery/interleaving test plan. +- Runtime invariant or reconciliation plan. + +## Checks Before Moving On + +- `invariant_list`: critical safety and liveness properties are written in plain, testable language. +- `model_boundary`: actors, state, messages, timing, and failure assumptions are explicit. +- `counterexample_search`: validation attempts to find failing traces, not just confirm expected cases. +- `code_mapping`: each modeled transition maps to implementation code, tests, or runtime checks. +- `recovery_cases`: duplicate, reorder, retry, crash, timeout, and partial-failure cases are covered or explicitly exempted. + +## Red Flags - Stop And Rework + +- "Exactly once", "no split brain", or "strong consistency" is asserted without invariants. +- The model omits retries, duplicate messages, crash recovery, or clock assumptions that exist in production. 
+- Property tests only replay hand-picked examples. +- A counterexample is dismissed because it is unlikely rather than impossible or risk-accepted. +- Runtime behavior cannot be traced back to the model. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Modeling the whole system | Model the smallest state machine that protects the invariant. | +| Confusing tests with properties | Write the rule first, then generate or search cases. | +| Ignoring liveness assumptions | State what timing, retry, and failure assumptions allow progress. | +| Letting specs drift | Update model and invariant checks when implementation changes. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/tenant-isolation.md b/plugins/sirmarkz/staff-engineer-mode/specialists/tenant-isolation.md new file mode 100644 index 00000000..f4344fa5 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/tenant-isolation.md @@ -0,0 +1,137 @@ +--- +name: tenant-isolation +description: "Use when multi-tenant systems need tenant context, data partitioning, quotas, cross-tenant tests, or safe telemetry" +--- + +# Tenant Isolation And Data Protection + +## Iron Law + +``` +NO TENANT-SENSITIVE PATH WITHOUT TENANT CONTEXT, ACCESS BOUNDARY, QUOTA, AUDIT, AND PRIVACY CONTROL +``` + +If a request or query can lose tenant context, cross-tenant leakage or impact is only a matter of time. + +> This skill assumes a multi-tenant deployment serving more than one customer or organization on shared infrastructure. A single-tenant deployment may still need it if PII or privacy domains create internal boundaries; otherwise route privacy work to `privacy-and-data-lifecycle`. + +## Overview + +Multi-tenancy fails when tenant context is optional. + +**Core principle:** carry tenant and data classification through every request, query, log, metric, trace, audit event, quota, and operational workflow. 
+ +## When To Use + +- The user asks about multi-tenancy, tenant isolation, PII, privacy, noisy neighbors, cross-tenant blast radius, tenant quotas, or tenant-aware logging. +- A service stores, queries, caches, logs, exports, or processes data for multiple customers, organizations, or privacy domains. +- A bug could expose one tenant's data to another or let one tenant consume shared capacity. +- A design needs tenant-aware audit, encryption, retention, deletion, or access controls. + +## When Not To Use + +- The request is general authentication/authorization without tenant or data boundary concerns; use `identity-and-secrets` instead. +- The request is broad privacy lifecycle, minimization, retention, deletion, or privacy-safe telemetry without tenant-boundary concerns; use `privacy-and-data-lifecycle` instead. +- The main issue is public abuse or DDoS at the edge; use `edge-traffic-and-ddos-defense` instead. +- The work is only supply-chain or artifact integrity; use `software-supply-chain-security` instead. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Tenant model: silo, pool, bridge, organization/account hierarchy, shared services, and administrative boundaries. +- Data classification, PII/sensitive fields, retention, deletion, export, and residency constraints. +- Request, query, cache, event, batch, search, analytics, and support/admin paths that carry tenant data. +- Access controls, tenant context propagation, activity logs, row/object boundaries, and break-glass behavior. +- Quotas, rate limits, concurrency caps, noisy-neighbor risks, and per-tenant isolation needs. +- Admission point for tenant limits, dynamic limit update path, fair-share behavior, and privacy-safe impact scoping. +- Logging, metrics, traces, crash/error reports, and support tooling that may expose sensitive data. + +## Workflow + +1. 
**Define tenancy.** State what tenant means, how tenant IDs are assigned, and which resources are tenant-scoped. Define the model: silo means dedicated stack per tenant; pool means shared stack with logical isolation; bridge means shared control plane with tenant-dedicated data or runtime boundaries. +2. **Map tenant context.** Follow tenant context through request handling, storage, caches, events, jobs, logs, metrics, traces, and admin tools. +3. **Choose isolation model.** Use silo, pool, bridge, hybrid, or isolation-group boundaries based on data sensitivity, blast radius, scale, cost, and tenant-specific residency or compliance needs. Isolation groups separate sets of tenants from each other while preserving finer isolation inside each group. +4. **Choose data partitioning.** State whether tenants use separate stores, separate schemas/namespaces, shared schemas with enforced tenant predicates, or tenant-scoped encryption and credentials. +5. **Enforce data boundaries.** Apply tenant filters, scoped credentials, row/object boundaries, query guards, cache-key tenant assertions, and cross-tenant tests. +6. **Control noisy neighbors.** Add per-tenant quotas, rate limits, concurrency caps, and load-shedding rules where shared capacity exists; enforce cheap admission checks before expensive work when possible and define how limits change safely during an event. +7. **Protect privacy surfaces.** Minimize, redact, tokenize, encrypt, or segregate sensitive data in logs, telemetry, exports, and support views. +8. **Handle tenant offboarding.** Propagate deletion and access removal through stores, caches, indexes, derived data, exports, backup expiry, and support tooling. +9. **Audit high-risk access.** Record administrative, support, export, deletion, and cross-tenant operations in tenant-scoped activity logs; define retention long enough for forensics, compliance, and incident investigation. +10.
**Verify isolation.** Use tests, probes, reviews, and monitoring for cross-tenant reads/writes and capacity abuse. + +## Synthesized Default + +Make tenant context mandatory and enforce it at multiple layers: application, data access, cache/event/job processing, audit, and observability. Choose the weakest shared-tenancy model that still satisfies blast-radius and data-boundary requirements, then combine tenant quotas with privacy-aware logging and cross-tenant tests. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Single-tenant deployments can still need this skill when PII, privacy, or data-protection controls are central. +- Stronger silo isolation is warranted for highly sensitive tenants or regulatory boundaries even if cost is higher. +- Shared pooled models are acceptable when tenant context, quotas, and tests are strong enough for the risk. +- Emergency support access may cross normal boundaries only with justification, time limit, audit, and review. + +## Response Quality Bar + +- Lead with the isolation model, cross-tenant risk, boundary-control plan, or test gap requested. 
+- Cover tenant context propagation, data access boundaries, cache/event/job paths, quotas, privacy-safe telemetry, support access, and cross-tenant tests before optional tenancy breadth. +- Make recommendations actionable with enforcement layers, query/key rules, quotas, tenant-scoped activity logs with retention, test cases, and stop criteria where relevant. +- Name the details to inspect, such as request flows, schema keys, cache keys, job payloads, event envelopes, support-tool logs, quota metrics, audit retention settings, and cross-tenant test results; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside tenant isolation and data protection. Route general privacy or identity work only when it materially changes the isolation decision. +- Be concise: avoid generic multi-tenancy background and prefer compact propagation maps and boundary-control tables. + +## Required Outputs + +- Tenant isolation model and rationale. +- Tenant context propagation map. +- Data partitioning and isolation-group decision when applicable. +- Data classification and sensitive-field handling plan. +- Access, query, cache, event, and job boundary controls. +- Tenant offboarding and deletion propagation plan. +- Noisy-neighbor quota and capacity policy. +- Dynamic tenant-limit update path and privacy-safe impact scoping signals. +- Privacy-safe logging/telemetry/support review. +- Tenant-scoped audit log requirements, including covered events, protected fields, retention period or retention policy, and review responsibility. +- Cross-tenant test requirements, including forced-tenant mismatch, missing-tenant-filter detection, random tenant-ID probes, and cache-key assertions. 
+ +## Checks Before Moving On + +- `tenant_context`: every request/query/job/event/cache path preserves tenant context or is explicitly tenant-neutral. +- `data_boundary`: data access controls enforce tenant isolation where shared stores exist. +- `privacy_check`: sensitive data handling is defined for logs, traces, metrics, errors, exports, and support tools. +- `quota_check`: shared capacity has tenant-aware quotas or an explicit risk acceptance. +- `early_admission`: tenant or caller limits apply before expensive shared work where feasible. +- `dynamic_limit_path`: emergency or routine limit changes have a safe update, rollback, and verification path. +- `tenant_impact_scope`: tenant impact can be scoped with privacy-safe operational signals. +- `activity_log_check`: tenant-scoped activity logs cover high-risk access and define retention for forensics and incident investigation. +- `cross_tenant_test`: tests or probes cover unauthorized cross-tenant read/write paths. + +## Red Flags - Stop And Rework + +- Tenant ID is passed as an optional parameter. +- Logs or traces include raw PII or tenant secrets. +- Background jobs process tenant data without tenant-scoped responsibility. +- Shared caches omit tenant from keys. +- Support tools can access tenant data without audit. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating tenant isolation as only authz | Enforce tenant context through data, cache, jobs, telemetry, and audit. | +| Ignoring noisy neighbors | Add tenant-aware quotas and saturation signals. | +| Trusting manual review | Add cross-tenant tests and query guards. | +| Logging for convenience | Redact, tokenize, or omit sensitive fields. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/test-data-engineering.md b/plugins/sirmarkz/staff-engineer-mode/specialists/test-data-engineering.md new file mode 100644 index 00000000..9d932524 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/test-data-engineering.md @@ -0,0 +1,147 @@ +--- +name: test-data-engineering +description: "Use when designing fixtures, golden files, or production snapshots needing anonymization, freshness, or drift checks" +--- + +# Test Data Engineering + +## Iron Law + +``` +NO TEST RELIES ON DATA THE TEST CANNOT REPRODUCE OR RESTORE +``` + +A green test backed by lost-provenance data does not stand on its own. The next refresh, the next anonymization sweep, or the next schema change will turn it red without any code change. + +## Overview + +Produces a fixture inventory with scope and regeneration path per fixture, an anonymization policy for any test data sourced from production, a freshness-versus-determinism decision per fixture class, and a drift-detection plan that fires when production data shape diverges from the data the tests run on. Refuses to call a test passing when the data it relies on cannot be reproduced or restored. + +**Core principle:** test data is a production artifact. If a fixture cannot be regenerated, anonymized, or restored on demand, the tests that depend on it are an outage waiting for the next refresh. + +## When To Use + +- The user is designing, changing, or maintaining fixtures, golden files, snapshots, captured production data, or synthetic test data. +- A flaky or order-dependent test is suspected to depend on shared mutable fixture state. +- A test relies on production-sourced data that may contain personal or sensitive information and the anonymization policy is unclear or absent. +- A schema change in production broke a contract test, integration test, or migration that ran on stale fixtures. 
+- You need to decide between freshly captured production data, snapshotted captures, hand-built fixtures, or synthetic generation for a given test layer. +- A regression appears in production that fixtures did not cover because the fixture predated the data shape that caused it. +- An ML or analytics fixture is drifting from production distribution and graders, thresholds, or correctness checks are losing signal. + +## When Not To Use + +- The work is overall test strategy, check placement, runtime budgets, or flake policy; use `testing-and-quality-gates`. +- The work is privacy retention, deletion, export, erasure, or data classification across systems; use `privacy-and-data-lifecycle`. +- The work is a producer/consumer schema contract evolution decision; use `data-contracts`. +- The work is a production batch or streaming pipeline's freshness, lineage, or replay; use `data-pipeline-reliability`. +- The work is database migration safety, locks, backfills, or index rollout; use `database-operations`. +- The work is ML model training or serving drift detection on production traffic; use `ml-reliability-and-evaluation`. +- The work is producing eval datasets and graders for an LLM workflow; use `llm-evaluation`. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Fixture inventory source: which test layers (unit, component, contract, integration, end-to-end, performance, security, ML/LLM) hold fixtures, where each fixture lives, and how it is loaded. +- Per-fixture metadata: name, scope (per test, per file, per suite, per process, per environment), generation source (hand-written, synthetic generator, captured production, derived golden), and refresh cadence. +- Production-source captures: which fixtures are captured from production, when each was captured, what fields were included, and what anonymization or redaction was applied. 
+- Schema versions: production schema version per source, fixture schema version per fixture, and any pinned version skew between them. +- Golden-file inventory: which tests assert against golden outputs and the procedure for regenerating each one. +- Data-shape signals: production distributions for fields the tests rely on (cardinality, nullability, value ranges, categorical sets) and the latest measurements available. +- Privacy classification of any captured data: personal data, sensitive personal data, regulated data, secrets, customer-identifying data. +- Test isolation: whether fixtures are mutated by tests, whether mutations leak across tests, and whether test order affects outcome. +- Restore/reproduce procedure: for each fixture class, the documented steps to recreate it from scratch, and the time those steps take. + +## Workflow + +1. **Build the fixture inventory.** Reconcile fixtures discovered in the test tree, in CI artifact storage, and in any captured-data store. A fixture in only one of those is the first orphan. +2. **Classify each fixture.** Assign one class: hand-built (small, deterministic), synthetic-generated (programmatic, parameterized), captured (sampled or copied from production), derived golden (the test's expected output), or shared seeded state (a fixture multiple tests depend on). +3. **Decide freshness versus determinism per class.** Hand-built and synthetic fixtures should be deterministic by default; captured fixtures trade determinism for realism and need a refresh policy; derived goldens are deterministic by construction but require a regeneration procedure when behavior intentionally changes; shared seeded state is the most fragile and should be minimized. +4. **Set the scope rule per fixture.** Default to per-test scope. Move to per-file or per-suite scope only when the setup cost demands it and the fixture is read-only. Per-process or per-environment shared state requires explicit isolation guarantees and a teardown command. 
+5. **Apply the anonymization policy to captured data.** For each captured fixture, identify direct identifiers, quasi-identifiers, sensitive fields, and free-text fields that may carry personal data. Anonymization may require pseudonymization, generalization, suppression, redaction, or synthetic replacement. Hash-only is rarely sufficient because reidentification through quasi-identifiers and timing is common. +6. **Make captures restorable.** For each captured fixture, record source, capture timestamp, anonymization transform, and the procedure to recapture under current schema. A capture without these is unrestorable and must be regenerated or replaced. +7. **Establish drift detection.** For fields the tests rely on, compare production distribution to fixture distribution at a defined cadence. Alert when categorical sets diverge, nullability shifts, value ranges drift, or new required fields appear. Drift is a fixture-staleness signal before it is a test failure. +8. **Generate hard-to-construct data deliberately.** For domains with combinatorial complexity (financial calculations, rare-but-important edge cases, multi-step workflows), use parameterized generators, property-based generation, or scenario builders rather than hand-typing values that age out. +9. **Govern golden files.** State the regeneration procedure, the regeneration check, and the rule for what counts as an intentional change versus an accidental drift. Goldens regenerated without review erase the test's signal. +10. **Cull orphans.** Fixtures with no callers, absent sources, or sources that no longer exist must be removed or given a reproducible generator/restore path. Unrecoverable fixtures become production-critical by accident. +11. **Make findings directly actionable.** Each finding names the fixture path, the affected test path, and the local remediation: anonymize, regenerate, replace with synthetic, narrow scope, or delete. 
+ +## Synthesized Default + +Prefer synthetic, parameterized fixtures generated at test time. Use captured production data only when realism is required and the anonymization, refresh, and restore procedures are real. Default to per-test scope. Goldens are deterministic by construction and have a documented regeneration procedure. Drift detection compares fixture distributions to production at a defined cadence. Captured data without anonymization or restore procedure is removed. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Performance and load test fixtures may use captured-shape (size, distribution) without captured content; the realism the test needs is shape, not values. +- Compatibility and replay tests for legacy data may pin a real captured corpus; the corpus must still be anonymized and restorable, and the freshness policy may be paused with a stated reason. +- Property-based generators may be slow on first run; pin a seed for reproducibility and treat the seed as part of the fixture. 
+- Regulated workloads may forbid certain anonymization techniques because they are reversible under the threat model; record the exception and the resulting test-coverage impact. +- A snapshot/golden may pin third-party output that you do not control; in that case responsibility is "you accept the snapshot until the upstream changes" and the regeneration trigger is upstream-version change. + +## Response Quality Bar + +- Lead with the fixture inventory, anonymization rule, freshness/determinism decision, drift-detection plan, or golden-file rule requested. +- Cover classification, scope, anonymization, restore procedure, drift detection, and golden regeneration before optional fixture-tooling breadth. +- Make recommendations actionable with per-fixture path, classification, scope, refresh cadence, anonymization transform, restore procedure, and the local fix for each finding. +- Name the details to inspect, such as the fixture inventory, capture timestamps, anonymization transforms, production-distribution measurements, and the regeneration procedure for each golden; do not state restorability without the procedure. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside the data layer of testing. Route check placement and flake policy, privacy program work, schema-contract evolution, pipeline freshness, migration safety, ML drift, and LLM eval datasets to the responsible specialist. +- Be concise: prefer compact inventory and decision tables over generic fixture-management prose. + +## Required Outputs + +- Fixture inventory with name, classification, scope, generation source, refresh cadence, and restore procedure per fixture. +- Anonymization policy for captured data covering direct identifiers, quasi-identifiers, sensitive fields, free-text, and the transform applied per field type. 
+- Freshness-versus-determinism decision per fixture class with the rule that governs each. +- Drift-detection plan listing the fields tracked, the production source, the comparison cadence, the alert threshold, and the local triage procedure for a drift alert. +- Golden-file rule: regeneration procedure, explicit checker, intentional-change-versus-drift rule, and the test path per golden. +- Orphan and responsibility report for fixtures with no callers or no recoverable source, with the remediation per fixture. +- Hard-to-construct data plan: which scenarios use generators, the seed/version policy for reproducibility, and the generator path. +- Restore-and-reproduce procedure per fixture class with documented steps and expected runtime. +- Follow-up routes to test strategy, privacy lifecycle, data contracts, pipeline reliability, database operations, ML reliability, or LLM evaluation as needed. + +## Checks Before Moving On + +- `fixture_inventory_present`: a single inventory reconciles fixtures across the test tree, CI storage, and any capture store; mismatches are listed. +- `classification_assigned`: every fixture has one class from hand-built, synthetic, captured, derived golden, or shared seeded. +- `scope_documented`: every fixture has a stated scope and shared state has isolation and teardown checks. +- `anonymization_applied`: every captured fixture has an applied anonymization transform sufficient against direct and quasi-identifier reidentification or an explicit recorded exception. +- `restore_procedure`: every fixture has a documented restore-or-regenerate procedure and an estimated runtime. +- `drift_detection_plan`: production-distribution comparison is defined for the fields the tests rely on, with cadence, threshold, and local triage procedure. +- `golden_rule`: each golden file has a regeneration procedure and an intentional-change rule. +- `orphan_culled`: fixtures with no callers or no recoverable source are listed with remediation. 
+ +## Red Flags - Stop And Rework + +- A fixture exists but no one knows where it came from or how to regenerate it. +- Captured production data is in the test tree with no anonymization beyond field renaming. +- Tests share mutable state and pass only in a specific order. +- Goldens are regenerated automatically when they fail, erasing the test's signal. +- A schema change broke tests because fixtures pinned an obsolete shape and no drift signal fired. +- Anonymization is documented but the transform has not been re-applied since the last production-schema change. +- Hard-to-construct fixtures are hand-typed in many test files, drifting from each other and from production. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating fixtures as throwaway test code | Inventory fixtures with classification and restore procedure. | +| Field-rename as anonymization | Apply transforms against direct and quasi-identifier reidentification; record the policy. | +| Shared mutable seeded state | Default to per-test scope; require isolation and teardown commands for shared state. | +| Auto-regenerating goldens on failure | Require an explicit check for golden regeneration; distinguish intentional change from drift. | +| Capturing once and forgetting | Set a refresh cadence and a recapture procedure tied to schema changes. | +| Hand-typing combinatorial scenarios | Use generators or scenario builders with seeds for reproducibility. | +| No drift signal | Compare fixture distributions to production at a defined cadence with a triage path. 
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/testing-and-quality-gates.md b/plugins/sirmarkz/staff-engineer-mode/specialists/testing-and-quality-gates.md new file mode 100644 index 00000000..da0e6c34 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/testing-and-quality-gates.md @@ -0,0 +1,140 @@ +--- +name: testing-and-quality-gates +description: "Use when test strategy, merge/release checks, CI budgets, static analysis, mutation tests, flakes, or ratchets matter" +--- + +# Testing And Quality Checks + +## Iron Law + +``` +EVERY TEST CHECKS A NAMED RISK; EVERY BLOCKING CHECK HAS A FAILURE RESPONSE +``` + +Tests exist to exercise a specific risk; "we have tests" without naming the risk each test exercises is weak signal. A blocking check without a written failure response teaches people to ignore it. For a solo developer the response can be a single sentence: what the agent should inspect, what command verifies the fix, and when to quarantine or downgrade the check. + +## Overview + +Quality checks should catch real risk early without turning delivery into ritual. + +**Core principle:** place fast, deterministic, high-signal checks before merge; reserve slower or broader checks for the stage where they can show something useful. + +## When To Use + +- The user asks for test strategy, merge checks, release checks, CI checks, quality standards, test pyramid/trophy, static analysis, coverage policy, or verification requirements. +- You need to decide what must pass before merge, before release, or before launch. +- A legacy codebase needs quality ratchets without stopping all work. +- Existing tests or CI are slow, flaky, low-signal, or ignored. + +## When Not To Use + +- The user asks about generic review behavior, responsibility routing, or review latency with no merge or release check decision. +- The user asks for canary or production rollout checks; use `progressive-delivery` instead. 
+- The request is production chaos or failover testing; use `resilience-experiments` or `high-availability-design` instead. +- The question is pure formatting/style enforcement; automate it and keep this skill focused on risk. + +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Supported behaviors, critical journeys, system tier, risk areas, and recent defect history. +- Existing test inventory: unit/component/contract/integration/end-to-end/performance/security/accessibility/static checks. +- Pre-traffic health checks, critical-path sanity checks, production-like integration checks, synthetic or canary checks, and performance bottleneck tests. +- CI structure, runtime, flake rate, failure responsibility, and required versus advisory checks. +- Coverage signal, mutation or fault-injection needs, legacy findings, and known blind spots. +- Release process and where checks can run without excessive feedback delay. + +## Workflow + +1. **Classify risk.** Identify correctness, compatibility, security, reliability, performance, data, and accessibility risks introduced by the change. +2. **Place tests low.** Prefer the cheapest deterministic check that exercises the behavior; use broader tests only for cross-boundary confidence. +3. **Define a test taxonomy.** Group checks by dependency and runtime cost so fast in-memory/component tests protect merge, deployment tests protect release, and production probes protect rollout. +4. **State suite composition.** For CI reduction, flake cleanup, or suite redesign, include a compact current or target layer mix such as unit/component, contract/integration, and end-to-end counts or ratios, with one rationale tied to speed, determinism, and risk coverage. +5. **Separate check types.** Pre-merge checks should be fast and high-signal; use a default budget such as p95 under 10 minutes for the full pre-merge lane and under 5 minutes for a fast path. 
Pre-release checks can be broader; production checks belong to rollout. +6. **Check before traffic.** For serving systems, startup/readiness checks and critical-path sanity checks should pass before new capacity accepts real traffic. +7. **Make checks actionable.** Every blocking check needs failure instructions and a path to fix or quarantine. +8. **Handle flakes ruthlessly.** A flaky blocker teaches people to ignore checks. Fix, quarantine, or downgrade with a dated expiry. +9. **Use ratchets for legacy.** Prevent new critical findings and gradually reduce existing debt rather than requiring impossible cleanup. +10. **Place high-assurance tests deliberately.** Bounded property tests on pure logic and ordinary fuzzing can live in this skill; concurrency/protocol invariants, model checking, deterministic simulation, and counterexample-driven validation route to formal validation. +11. **Choose test data safely.** Use synthetic data for pre-merge by default, anonymized or captured production-like data in controlled release stages, and explicit privacy checks for sensitive fixtures. +12. **Use mutation testing selectively.** Apply it to safety, security, financial, or dense branch logic where coverage percentage is misleading; do not make it a universal check. +13. **Keep style mechanical.** Formatting and simple style should be automated, not debated manually. +14. **Verify the strategy.** Confirm each critical risk has a check, test, check artifact, or explicit exception. + +## Synthesized Default + +Use a risk-based test strategy with fast deterministic pre-merge checks, focused integration/contract checks for boundaries, static/security analysis in the developer path, and broader release checks only where they add confidence. Push tests left when they can run reliably before merge; push tests right only when production reality is needed. Block on high-signal checks; make low-signal checks advisory until they are trustworthy. 
+ + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Legacy systems may use non-regression ratchets before enforcing absolute thresholds. +- Flaky tests should not block until fixed or quarantined with clear responsibility. +- Safety-critical, financial, security-sensitive, or data-destructive paths may require deeper verification, formal methods, or simulation. +- Generated or third-party code may use contract and integration checks instead of unit-level responsibility. + +## Response Quality Bar + +- Lead with the test strategy, check matrix, blocker decision, or quality-risk map requested. +- Cover risk mapping, check stage, failure response, flake policy, static/security checks, and legacy ratchets before optional testing breadth. +- For slow-CI, bypassed-CI, flaky-suite, or suite-redesign prompts, always state the intended test-layer composition as counts or ratios and explain why that mix gives faster, more deterministic signal than the current shape. +- Make recommendations actionable with blocking/advisory status, validation commands, quarantine rules, stop criteria, and rollout of new checks where relevant. 
+- Name the details to inspect, such as defect history, critical journeys, CI runtime, flake rate, coverage gaps, static findings, and release failure data; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside verification and quality checks. Route production rollout checks or chaos testing only when they are the central unresolved risk; generic review workflow has no routed specialist. +- Be concise and prefer compact risk-to-check matrices, but always state: a flake-rate metric paired with a quarantine timer, a coverage metric+target paired with a meaningful-vs-vanity caveat, a CI runtime target paired with how it is measured, and per-layer test ratios with rationale when test composition is in scope. + +## Required Outputs + +- Test strategy by risk area and lifecycle stage. +- Check matrix: pre-merge, pre-release, launch, and advisory checks. +- Critical-path sanity and pre-traffic health checks with expected behavior and stop condition. +- Runtime budget for blocking lanes with a measurement source (p95 from CI history, not aspirational), and the action when the budget is exceeded. +- Test composition by layer (unit/component, contract/integration, end-to-end, and specialized checks) with counts or ratios and rationale whenever cutting CI time, handling flakes, or redesigning a suite. +- Failure response for each blocking check. +- Static analysis, security scanning, and dependency check policy. +- Coverage or mutation policy where it adds useful signal — name the metric, the target, and the meaningful-vs-vanity caveat (changed-code coverage, critical-path coverage). +- Test data sourcing and privacy/sensitivity policy. +- Flake management and quarantine policy — state the flake-rate threshold (e.g. >1% rerun rate) and the quarantine timer (e.g. 
24-48h to quarantine or downgrade with expiry). +- Legacy ratchet plan with cadence, target metric, and next reduction step. + +## Checks Before Moving On + +- `risk_mapping`: every critical risk maps to a test, check artifact, or explicit exception. +- `check_signal`: every blocking check has high signal and a failure response. +- `flake_policy`: flaky checks have a fix, quarantine, downgrade, or expiry decision. +- `stage_fit`: each check runs at the earliest stage where it can check the intended property. +- `critical_path_sanity`: critical user paths have sanity checks that validate behavior, not only process health. +- `pre_traffic_health`: new capacity passes startup/readiness checks before accepting real traffic. +- `promotion_checks`: production-like integration, synthetic, canary, or performance checks stop promotion when critical behavior fails. +- `suite_shape`: test-layer counts or ratios match the risk profile, with most pre-merge confidence coming from cheap deterministic checks and only bounded broad tests blocking. +- `legacy_ratchet`: existing debt has a non-regression rule and reduction plan. + +## Red Flags - Stop And Rework + +- A slow end-to-end suite is the only meaningful pre-merge check. +- Coverage percentage is treated as quality without behavior/risk mapping. +- Flaky tests are required but failures are routinely rerun until green. +- Static analysis results appear after merge with no local fix path or suppression rule. +- Checks block but nobody can explain what failure means. +- High-assurance protocol or concurrency validation is treated as ordinary CI without invariants or counterexamples. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Testing implementation shape | Test supported behavior and contracts. | +| Blocking on noisy tools | Start advisory, tune signal, then enforce. | +| One giant quality check | Split by lifecycle stage and risk. 
| +| Demanding instant legacy perfection | Use ratchets and prevent new debt. | diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/vulnerability-management.md b/plugins/sirmarkz/staff-engineer-mode/specialists/vulnerability-management.md new file mode 100644 index 00000000..81027c10 --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/vulnerability-management.md @@ -0,0 +1,127 @@ +--- +name: vulnerability-management +description: "Use when deployed or release-bound vulnerabilities need exploitability triage, patch SLAs, rollout, or exceptions" +--- + +# Vulnerability Management And Patch SLA + +## Iron Law + +``` +NO VULNERABILITY ACCEPTANCE WITHOUT EXPOSURE ANALYSIS, DEADLINE, MITIGATION, AND VERIFICATION +``` + +If a vulnerable artifact remains deployed, the reason and recheck date must be explicit. + +## Overview + +Vulnerability management is exposure reduction with deadlines, rollout safety, and verification. + +**Core principle:** prioritize by exploitability, exposure, reachability, asset criticality, and compensating controls, not by severity score alone. + +## When To Use + +- The user is deciding vulnerability/advisory triage, patch SLAs, exploitability signals, remediation exceptions, time-to-fix, rollout, or live-fix verification for deployed or release-bound code. +- A vulnerable artifact is deployed or about to be promoted and needs exposure, exploitability, exception, patch-order, or verification decisions. +- A scanner finds a vulnerability in a deployed service, image, dependency, runtime, or infrastructure component. +- You need risk-based prioritization or exception rules. +- A vulnerability fix needs rollout and verification results. + +## When Not To Use + +- The work is routine dependency updates with no urgent deployed risk; use `dependency-and-code-hygiene` instead. +- The main gap is build provenance, dependency inventory, or artifact trust; use `software-supply-chain-security` instead. 
+- The question is secure design of a new feature; use `secure-sdlc-and-threat-modeling` instead. +- The request is legal disclosure strategy; out of scope unless framed as engineering remediation details. + +## Info To Gather + +- Vulnerability or advisory ID, affected package/component, fixed versions, exploit status, exploitability signals, severity, and advisory details. +- Deployed assets, versions, exposure, network reachability, privileges, data access, and service tier. +- Runtime reachability, feature usage, compensating controls, tenant/customer impact, and exploit preconditions. +- Patch path, test coverage, rollout plan, rollback risk, maintenance window, and user decision deadline. +- Existing exceptions, business constraints, and verification method. + +## Workflow + +1. **Identify deployed exposure.** Determine whether the vulnerable component is present, inventoried, reachable, exploitable, and customer-impacting in production. +2. **Prioritize by risk.** Combine known exploitation, exploit likelihood, external exposure, runtime reachability, privilege, sensitive data, service tier, and compensating controls. Active exploitation or externally reachable sensitive paths override nominal severity. +3. **Use a default SLA ladder.** Emergency means active exploitation or externally reachable critical path: mitigate immediately and patch inside days. High means reachable production path with sensitive data, privilege, or tier-critical service: patch inside one to two weeks. Medium means reachable internal or bounded path: patch in the next planned release window. Low means not reachable or strongly compensated: track with expiry and recheck. +4. **Set deadline and SLA.** Set remediation deadline based on risk tier and active exploitation, not severity score alone. +5. **Choose remediation.** Patch, upgrade, remove, disable feature, isolate, constrain inputs, or add compensating controls until patch is safe. +6. 
**Handle no-patch cases.** When no fix exists, define workaround, isolation, feature disablement, input filtering, monitoring, and next recheck trigger. +7. **Roll out safely.** Use tests, canary, rollback, and compatibility checks for risky base/runtime/dependency changes. +8. **Verify fixed state.** Check that the vulnerable artifact is no longer deployed or the exploit path is mitigated. +9. **Record exceptions.** Use expiry, compensating control, recheck trigger, and residual risk. +10. **Feed upstream gaps.** Missing inventory/provenance goes to supply-chain security; routine update backlog goes to dependency hygiene. + +## Synthesized Default + +Use risk-based triage with exploitation signals, exposure, reachability, privilege, service tier, sensitive data, dependency inventory, and compensating controls. Patch active exploitation and internet-exposed critical paths aggressively, then verify deployed fixed state. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Active exploitation can justify emergency rollout outside normal cadence if mitigation risk is lower than exposure risk. 
+- Non-reachable vulnerable code may receive lower priority, but reachability details must be documented. +- Patches with high regression risk may require compensating controls and staged rollout before full remediation. +- Upstream-unfixed vulnerabilities need mitigation, monitoring, expiry, and recheck cadence; do not wait on an upstream fix when a local compensating control is available. + +## Response Quality Bar + +- Lead with the triage decision, patch SLA, remediation plan, exception, or verification gap requested. +- Cover exposure, reachability, exploitation signals, service tier, sensitive data, compensating controls, rollout risk, SLA, and fixed-state verification before optional vulnerability topics. +- Make recommendations actionable with deadlines, fallback paths, deploy batches, rollback plans, mitigation steps, and recheck cadence where relevant. +- Name the details to inspect, such as affected assets, deployed artifact versions, scanner findings, exploitability signals, reachability checks, exploit status, patch availability, and production verification; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside vulnerability remediation. Route routine dependency hygiene or supply-chain provenance only when those are the central unresolved risk. +- Be concise: avoid generic severity-score explanations and prefer compact triage and remediation tables. + +## Required Outputs + +- Vulnerability triage table. +- Exposure and reachability assessment. +- Patch SLA, deadline, and risk tier. +- Risk decision record showing exploitation, exposure, reachability, privilege, sensitive data, service tier, and compensating controls. +- Remediation and rollout plan. +- Exception record when remediation cannot meet SLA. 
+- Verification details that vulnerable deployed state is removed or mitigated. +- Metrics for aging, SLA breach, and time to fix, with the start event defined as detection in deployed scope or fix availability when no fix existed at detection time. + +## Checks Before Moving On + +- `exposure_check`: affected deployed assets, reachability, privilege, and data sensitivity are assessed. +- `risk_tier`: exploitability, exposure, service tier, and compensating controls are considered. +- `remediation_sla`: every remediation has a deadline and a fallback path. +- `rollout_check`: risky fixes have test, rollout, and rollback plan. +- `verification_check`: fixed or mitigated production state is verified. + +## Red Flags - Stop And Rework + +- Severity score alone determines urgency. +- Vulnerabilities are marked accepted with no expiry or compensating control. +- Scanner "fixed" status is trusted without verifying deployed artifacts. +- Patch rollout risk is ignored for critical services. +- Missing inventory prevents triage and is not surfaced as a blocker. +- No-patch vulnerabilities sit idle without workaround, monitoring, and recheck date. + +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Treating all highs alike | Prioritize by exploitation, exposure, reachability, and tier. | +| Closing tickets on merge | Verify the fixed artifact is deployed or mitigated. | +| Exception sprawl | Require expiry, compensating control, and recheck. | +| Ignoring rollout risk | Use safe-change practices for risky patches.
| diff --git a/plugins/sirmarkz/staff-engineer-mode/specialists/web-release-gates.md b/plugins/sirmarkz/staff-engineer-mode/specialists/web-release-gates.md new file mode 100644 index 00000000..3282f8ff --- /dev/null +++ b/plugins/sirmarkz/staff-engineer-mode/specialists/web-release-gates.md @@ -0,0 +1,124 @@ +--- +name: web-release-gates +description: "Use when planning browser releases needing loading, interaction, layout, runtime-error, telemetry, or budget checks" +--- + +# Frontend Performance Release Checks + +## Iron Law + +``` +NO CLIENT RELEASE CHECK WITHOUT USER-CENTRIC METRICS, JOURNEY BUDGETS, FIELD/LAB SIGNALS, AND ROLLBACK CRITERIA +``` + +If a release can make the client experience worse without tripping a check, the check is incomplete. + +## Overview + +Client-side quality is production reliability for the user's device and network. + +**Core principle:** check client-facing releases on field-user experience, journey-level budgets, runtime errors, and accessibility smoke checks, not only build success. + +## When To Use + +- The user is planning, building, changing, or reviewing a browser-delivered or client-rendered release that can affect user-perceived loading, interaction readiness, visual stability, runtime errors, payload weight, or client-side release safety. +- You need release thresholds for routes, screens, or user journeys. +- Field-user telemetry, lab checks, deploy markers, feature flags, or automated accessibility smoke checks are needed to stop client regressions from shipping. + +## When Not To Use + +- The request is product UX strategy, visual design, SEO strategy, or broad accessibility-program management. +- Backend latency is the only issue and client user experience is not central; use `performance-and-capacity` instead. +- The request is general CI check policy; use `testing-and-quality-gates` instead. +- The issue is mobile native release stability; use `mobile-release-engineering` instead. 
+ +## Info To Gather + +- Current work phase, next decision, what is known, and assumptions where details are missing. +- Critical routes, screens, journeys, user segments, devices, network classes, and supported clients. +- Field metrics: user-perceived loading, interaction readiness, visual stability, runtime errors, and journey-level latency. +- Lab metrics: payload weight, critical path work, client initialization or rendering cost, dependency weight, and synthetic checks. +- Current budgets, deploy markers, feature flags, rollout controls, and rollback path. +- Accessibility smoke checks that can be automated reliably. +- Privacy constraints for real-user monitoring and error collection. + +## Workflow + +1. **Pick user journeys and routes.** Check what users actually experience, not only the application shell. +2. **Set budgets.** Define journey-level payload, dependency, critical path, rendering, and interaction budgets. +3. **Use field and lab signals.** Use lab checks for fast feedback and field data for real user impact. +4. **Segment enough to see regressions.** Track mobile/desktop, browser, device class, geography/network, and key customer segments where relevant. +5. **Run accessibility smoke checks.** Automate high-signal checks such as missing labels, landmarks, contrast failures detectable by tooling, and keyboard traps where feasible. +6. **Mark releases.** Attach deploy, config, and feature markers to client telemetry and error reports. +7. **Define stop/rollback.** State thresholds for halting rollout, disabling flags, reverting bundles, or forward-fixing. +8. **Route backend causes.** If client experience regresses due to backend saturation, follow up with capacity/performance. + +## Synthesized Default + +Use user-centric journey-level budgets, field monitoring, lab checks, runtime-error tracking, deploy markers, automated accessibility smoke checks, and explicit rollback criteria.
Treat client performance regressions as release blockers when they affect critical journeys. + + + +## Phase Behavior + +- Ideation: identify risks, defaults, unknowns, options, and the next decision before code exists. +- Design: shape the target artifact, tradeoffs, checks, and details to gather. +- Development: guide sequencing, code boundaries, checks, and acceptance criteria. +- Testing: define release-blocking tests, evals, fixtures, and failure probes. +- Release: define rollout, observability, abort, rollback, and readiness details. +- Maintenance: define owners, drift checks, cleanup triggers, and refresh cadence. +- Existing artifact: use current code, docs, telemetry, incidents, or diffs as context for the next engineering decision; do not wait for a finished artifact before guiding design, build, release, or operation. +- Missing details: state assumptions and say what to check next instead of blocking lifecycle guidance. + +## Exceptions + +- Low-traffic routes may rely more on lab checks until enough field data exists. +- Accessibility smoke checks do not replace a full accessibility program; they catch release regressions within engineering scope. +- Experimental internal routes can use advisory budgets if isolated from customers. +- Emergency security fixes can ship with narrower performance checks if post-release monitoring and rollback are explicit. + +## Response Quality Bar + +- Lead with the release checks, blocking thresholds, or rollout decision requested. +- Cover user-centric metrics, journey budgets, field/lab signals, runtime errors, and rollback criteria before optional client quality topics. +- Make recommendations actionable with checks, stop conditions, and rollback or flag actions where relevant. +- For ticketing or release-readiness prompts, separate feature implementation tickets from release-control tickets; if a critical journey already regressed, lead with the hold, flag, or rollback decision before the ticket list. 
+- Name the details to inspect, such as field telemetry segments, lab checks, deploy markers, error rates, and payload/journey budgets; do not state details you have not seen. +- Stay technology-agnostic by default: do not introduce provider, product, framework, database, protocol, or command names unless the user supplied them or explicitly requested tool-specific guidance. +- Stay inside client release quality. Mention accessibility smoke checks only as release regression checks unless the prompt asks for broader accessibility work. +- Be concise: avoid generic web-performance background and prefer compact check matrices. + +## Required Outputs + +- Client runtime SLI/SLO table by journey, screen, or route. +- Performance budget for payload, dependency, critical path, rendering, and interaction costs. +- Field and lab measurement plan. +- Automated accessibility smoke-check list. +- CI/release check matrix with thresholds and failure response. +- Rollout, flag, and rollback criteria. +- Telemetry privacy notes. + +## Checks Before Moving On + +- `user_experience_check`: user-centric load, interaction, and visual stability metrics have journey-level targets. +- `budget_check`: payload, dependency, critical path, rendering, and interaction budgets exist with failure response. +- `field_lab_check`: both field and lab signals are used or a low-traffic exception is recorded. +- `a11y_smoke`: automated accessibility smoke checks are defined for release regressions. +- `rollback_check`: rollout halt, flag disable, revert, or forward-fix criteria are explicit. + +## Red Flags - Stop And Rework + +- Build success is the only client release check. +- Aggregate site metrics hide critical route regressions. +- Payload budgets exist but failures have no response path. +- Field monitoring collects sensitive data unnecessarily. +- Accessibility is treated as fully solved by one automated scan. 
+ +## Common Mistakes + +| Mistake | Correction | +| --- | --- | +| Lab-only confidence | Combine fast lab checks with field user impact. | +| Global budgets only | Set route and journey budgets. | +| No deploy markers | Tag releases, config, and feature flags in telemetry. | +| Broad accessibility scope creep | Keep release checks to automated smoke checks and route larger work separately. | diff --git a/scripts/generate_plugins_json.py b/scripts/generate_plugins_json.py index 60393727..0923e96f 100644 --- a/scripts/generate_plugins_json.py +++ b/scripts/generate_plugins_json.py @@ -51,6 +51,11 @@ # copying the full upstream repository or leaving broken local references. "mturac/everything-openai-codex", } +EXTRA_MIRROR_PATHS = { + # Staff Engineer Mode exposes one router skill and loads routed specialist + # files from a top-level specialists/ directory at runtime. + "sirmarkz/staff-engineer-mode": ("specialists",), +} def normalize_relative_path(value: str) -> str: @@ -189,6 +194,7 @@ def collect_selected_paths( manifest: dict[str, object], all_names: set[str], plugin_root: PurePosixPath, + plugin: dict[str, str], ) -> set[str]: selected = {".codex-plugin/plugin.json"} @@ -214,6 +220,9 @@ def collect_selected_paths( if isinstance(screenshot, str): add_recursive_selection(selected, all_names, plugin_root, screenshot) + for extra_path in EXTRA_MIRROR_PATHS.get(f"{plugin['owner']}/{plugin['repo']}", ()): + add_recursive_selection(selected, all_names, plugin_root, extra_path) + return selected @@ -295,7 +304,7 @@ def mirror_plugin_bundle(plugin: dict[str, str]) -> tuple[dict[str, object], str selected_paths = ( collect_metadata_only_paths(manifest, names, plugin_root) if metadata_only - else collect_selected_paths(manifest, names, plugin_root) + else collect_selected_paths(manifest, names, plugin_root, plugin) ) mirrored_manifest = sanitize_metadata_only_manifest(manifest, plugin) if metadata_only else manifest