diff --git a/scripts/loop-data.mjs b/scripts/loop-data.mjs
index 942d8e5..e3846aa 100644
--- a/scripts/loop-data.mjs
+++ b/scripts/loop-data.mjs
@@ -407,37 +407,41 @@ export const loops = [
slug: "full-product-evaluation-loop",
title: "The full product evaluation loop",
summary:
- "Tests every major product capability and fixes outcomes below the quality bar.",
- seoTitle: "Full Product Evaluation Loop for AI Systems | Loop Library",
+ "Recreates production locally, tests every product surface, and fixes all verified bugs holistically.",
+ seoTitle: "Production-Grade Full Product Evaluation Loop | Loop Library",
description:
"A comprehensive product-quality workflow that evaluates realistic scenarios across every major capability, fixes weak outcomes, and reruns them to the defined bar.",
categoryLabel: "AI product evaluation workflow",
author: "Matthew Berman",
published: "2026-06-16",
- modified: "2026-06-17",
+ modified: "2026-06-21",
prompt:
- "Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.",
- verifyTitle: "Every one of the [N] scenarios meets the defined quality bar.",
+ "Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before touching production, using sensitive data, or taking destructive actions.",
+ verifyTitle: "Every inventoried product surface meets its documented acceptance criteria.",
verifyDetail:
- "The final evaluated run covers every major capability under the original conditions.",
+ "The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.",
useWhen:
- "Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.",
+ "Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.",
steps: [
- "List every major capability, define the success criteria and evaluation method, choose [N], and allocate realistic scenarios across the product surface.",
- "Run the full set under consistent conditions and evaluate every outcome with evidence.",
- "Document each scenario that misses the criteria, fix the underlying issue, and add focused regression coverage where appropriate.",
- "Rerun affected scenarios and then the complete set until every outcome meets the original quality bar.",
+ "Build a sanitized or synthetic production-scale local dataset, mirror safe production settings, and record unavoidable differences.",
+ "Inventory every user-facing feature, role, route, control, state, and workflow; define documented acceptance criteria and a finite risk-based edge-case set for each item.",
+ "Exercise every inventory item as a real user under its normal and defined edge-case conditions, logging each bug immediately with reproducible evidence.",
+ "Review the complete bug set for shared causes, dependencies, and conflicting fixes, then implement the smallest coherent solution with regression coverage.",
+ "Rerun affected paths and the complete inventory; stop only at a clean full pass or an explicit blocked handoff.",
],
why:
- "A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.",
+ "A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.",
note:
- "Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.",
+ "Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.",
keywords: [
- "AI product evaluation",
- "full product testing",
- "response scoring",
- "quality benchmark",
- "feature coverage",
+ "production-grade QA",
+ "production-like local testing",
+ "exhaustive product testing",
+ "real user testing",
+ "UI control coverage",
+ "edge case testing",
+ "bug documentation",
+ "full regression testing",
],
related: ["quality-streak-loop", "production-data-cleanup-loop"],
},
diff --git a/site/catalog.json b/site/catalog.json
index 97580db..8a528f6 100644
--- a/site/catalog.json
+++ b/site/catalog.json
@@ -478,28 +478,32 @@
},
"author": "Matthew Berman",
"published": "2026-06-16",
- "modified": "2026-06-17",
+ "modified": "2026-06-21",
"description": "A comprehensive product-quality workflow that evaluates realistic scenarios across every major capability, fixes weak outcomes, and reruns them to the defined bar.",
- "useWhen": "Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.",
- "prompt": "Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.",
+ "useWhen": "Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.",
+ "prompt": "Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before touching production, using sensitive data, or taking destructive actions.",
"verification": {
- "title": "Every one of the [N] scenarios meets the defined quality bar.",
- "detail": "The final evaluated run covers every major capability under the original conditions."
+ "title": "Every inventoried product surface meets its documented acceptance criteria.",
+ "detail": "The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence."
},
"steps": [
- "List every major capability, define the success criteria and evaluation method, choose [N], and allocate realistic scenarios across the product surface.",
- "Run the full set under consistent conditions and evaluate every outcome with evidence.",
- "Document each scenario that misses the criteria, fix the underlying issue, and add focused regression coverage where appropriate.",
- "Rerun affected scenarios and then the complete set until every outcome meets the original quality bar."
- ],
- "why": "A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.",
- "implementationNote": "Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.",
+ "Build a sanitized or synthetic production-scale local dataset, mirror safe production settings, and record unavoidable differences.",
+ "Inventory every user-facing feature, role, route, control, state, and workflow; define documented acceptance criteria and a finite risk-based edge-case set for each item.",
+ "Exercise every inventory item as a real user under its normal and defined edge-case conditions, logging each bug immediately with reproducible evidence.",
+ "Review the complete bug set for shared causes, dependencies, and conflicting fixes, then implement the smallest coherent solution with regression coverage.",
+ "Rerun affected paths and the complete inventory; stop only at a clean full pass or an explicit blocked handoff."
+ ],
+ "why": "A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.",
+ "implementationNote": "Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.",
"keywords": [
- "AI product evaluation",
- "full product testing",
- "response scoring",
- "quality benchmark",
- "feature coverage"
+ "production-grade QA",
+ "production-like local testing",
+ "exhaustive product testing",
+ "real user testing",
+ "UI control coverage",
+ "edge case testing",
+ "bug documentation",
+ "full regression testing"
],
"related": [
{
diff --git a/site/catalog.md b/site/catalog.md
index 5f7b36e..ba4c73b 100644
--- a/site/catalog.md
+++ b/site/catalog.md
@@ -94,10 +94,10 @@ URL above.
## 010 — [The full product evaluation loop](https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/)
- Category: Evaluation
-- Use when: Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.
-- Prompt: Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.
-- Verify: Every one of the [N] scenarios meets the defined quality bar. The final evaluated run covers every major capability under the original conditions.
-- Keywords: AI product evaluation, full product testing, response scoring, quality benchmark, feature coverage
+- Use when: Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.
+- Prompt: Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before touching production, using sensitive data, or taking destructive actions.
+- Verify: Every inventoried product surface meets its documented acceptance criteria. The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.
+- Keywords: production-grade QA, production-like local testing, exhaustive product testing, real user testing, UI control coverage, edge case testing, bug documentation, full regression testing
- Related: [The quality streak loop](https://signals.forwardfuture.ai/loop-library/loops/quality-streak-loop/), [The production data cleanup loop](https://signals.forwardfuture.ai/loop-library/loops/production-data-cleanup-loop/)
## 011 — [The test-suite speed loop](https://signals.forwardfuture.ai/loop-library/loops/test-suite-speed-loop/)
diff --git a/site/catalog.txt b/site/catalog.txt
index 5f7b36e..ba4c73b 100644
--- a/site/catalog.txt
+++ b/site/catalog.txt
@@ -94,10 +94,10 @@ URL above.
## 010 — [The full product evaluation loop](https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/)
- Category: Evaluation
-- Use when: Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.
-- Prompt: Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.
-- Verify: Every one of the [N] scenarios meets the defined quality bar. The final evaluated run covers every major capability under the original conditions.
-- Keywords: AI product evaluation, full product testing, response scoring, quality benchmark, feature coverage
+- Use when: Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.
+- Prompt: Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before touching production, using sensitive data, or taking destructive actions.
+- Verify: Every inventoried product surface meets its documented acceptance criteria. The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.
+- Keywords: production-grade QA, production-like local testing, exhaustive product testing, real user testing, UI control coverage, edge case testing, bug documentation, full regression testing
- Related: [The quality streak loop](https://signals.forwardfuture.ai/loop-library/loops/quality-streak-loop/), [The production data cleanup loop](https://signals.forwardfuture.ai/loop-library/loops/production-data-cleanup-loop/)
## 011 — [The test-suite speed loop](https://signals.forwardfuture.ai/loop-library/loops/test-suite-speed-loop/)
diff --git a/site/feed.xml b/site/feed.xml
index 46357da..d91c5f9 100644
--- a/site/feed.xml
+++ b/site/feed.xml
@@ -114,7 +114,7 @@
Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.
+Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before touching production, using sensitive data, or taking destructive actions.
Verify / stop
The final evaluated run covers every major capability under the original conditions.
+The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.
Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.
+Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.
A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.
+A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.
Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.
+Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.