Merge pull request #59 from Forward-Future/codex/production-grade-product-qa

mberman84 · web-flow · commit 3a32ff81ca6d · 2026-06-21T16:17:44.000-07:00
[codex] Upgrade full product evaluation loop
diff --git a/scripts/loop-data.mjs b/scripts/loop-data.mjs
@@ -407,37 +407,41 @@ export const loops = [
     slug: "full-product-evaluation-loop",
     title: "The full product evaluation loop",
     summary:
-      "Tests every major product capability and fixes outcomes below the quality bar.",
-    seoTitle: "Full Product Evaluation Loop for AI Systems | Loop Library",
+      "Recreates production locally, tests every product surface, and fixes all verified bugs holistically.",
+    seoTitle: "Production-Grade Full Product Evaluation Loop | Loop Library",
     description:
       "A comprehensive product-quality workflow that evaluates realistic scenarios across every major capability, fixes weak outcomes, and reruns them to the defined bar.",
     categoryLabel: "AI product evaluation workflow",
     author: "Matthew Berman",
     published: "2026-06-16",
-    modified: "2026-06-17",
+    modified: "2026-06-21",
     prompt:
-      "Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.",
-    verifyTitle: "Every one of the [N] scenarios meets the defined quality bar.",
+      "Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions.",
+    verifyTitle: "Every inventoried product surface meets its documented acceptance criteria.",
     verifyDetail:
-      "The final evaluated run covers every major capability under the original conditions.",
+      "The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.",
     useWhen:
-      "Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.",
+      "Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.",
     steps: [
-      "List every major capability, define the success criteria and evaluation method, choose [N], and allocate realistic scenarios across the product surface.",
-      "Run the full set under consistent conditions and evaluate every outcome with evidence.",
-      "Document each scenario that misses the criteria, fix the underlying issue, and add focused regression coverage where appropriate.",
-      "Rerun affected scenarios and then the complete set until every outcome meets the original quality bar.",
+      "Build a sanitized or synthetic production-scale local dataset, mirror safe production settings, and record unavoidable differences.",
+      "Inventory every user-facing feature, role, route, control, state, and workflow; define documented acceptance criteria and a finite risk-based edge-case set for each item.",
+      "Exercise every inventory item as a real user under its normal and defined edge-case conditions, logging each bug immediately with reproducible evidence.",
+      "Review the complete bug set for shared causes, dependencies, and conflicting fixes, then implement the smallest coherent solution with regression coverage.",
+      "Rerun affected paths and the complete inventory; stop only at a clean full pass or an explicit blocked handoff.",
     ],
     why:
-      "A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.",
+      "A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.",
     note:
-      "Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.",
+      "Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.",
     keywords: [
-      "AI product evaluation",
-      "full product testing",
-      "response scoring",
-      "quality benchmark",
-      "feature coverage",
+      "production-grade QA",
+      "production-like local testing",
+      "exhaustive product testing",
+      "real user testing",
+      "UI control coverage",
+      "edge case testing",
+      "bug documentation",
+      "full regression testing",
     ],
     related: ["quality-streak-loop", "production-data-cleanup-loop"],
   },
diff --git a/site/catalog.json b/site/catalog.json
@@ -478,28 +478,32 @@
       },
       "author": "Matthew Berman",
       "published": "2026-06-16",
-      "modified": "2026-06-17",
+      "modified": "2026-06-21",
       "description": "A comprehensive product-quality workflow that evaluates realistic scenarios across every major capability, fixes weak outcomes, and reruns them to the defined bar.",
-      "useWhen": "Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.",
-      "prompt": "Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.",
+      "useWhen": "Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.",
+      "prompt": "Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions.",
       "verification": {
-        "title": "Every one of the [N] scenarios meets the defined quality bar.",
-        "detail": "The final evaluated run covers every major capability under the original conditions."
+        "title": "Every inventoried product surface meets its documented acceptance criteria.",
+        "detail": "The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence."
       },
       "steps": [
-        "List every major capability, define the success criteria and evaluation method, choose [N], and allocate realistic scenarios across the product surface.",
-        "Run the full set under consistent conditions and evaluate every outcome with evidence.",
-        "Document each scenario that misses the criteria, fix the underlying issue, and add focused regression coverage where appropriate.",
-        "Rerun affected scenarios and then the complete set until every outcome meets the original quality bar."
-      ],
-      "why": "A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.",
-      "implementationNote": "Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.",
+        "Build a sanitized or synthetic production-scale local dataset, mirror safe production settings, and record unavoidable differences.",
+        "Inventory every user-facing feature, role, route, control, state, and workflow; define documented acceptance criteria and a finite risk-based edge-case set for each item.",
+        "Exercise every inventory item as a real user under its normal and defined edge-case conditions, logging each bug immediately with reproducible evidence.",
+        "Review the complete bug set for shared causes, dependencies, and conflicting fixes, then implement the smallest coherent solution with regression coverage.",
+        "Rerun affected paths and the complete inventory; stop only at a clean full pass or an explicit blocked handoff."
+      ],
+      "why": "A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.",
+      "implementationNote": "Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.",
       "keywords": [
-        "AI product evaluation",
-        "full product testing",
-        "response scoring",
-        "quality benchmark",
-        "feature coverage"
+        "production-grade QA",
+        "production-like local testing",
+        "exhaustive product testing",
+        "real user testing",
+        "UI control coverage",
+        "edge case testing",
+        "bug documentation",
+        "full regression testing"
       ],
       "related": [
         {
diff --git a/site/catalog.md b/site/catalog.md
@@ -94,10 +94,10 @@ URL above.
 ## 010 — [The full product evaluation loop](https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/)
 
 - Category: Evaluation
-- Use when: Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.
-- Prompt: Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.
-- Verify: Every one of the [N] scenarios meets the defined quality bar. The final evaluated run covers every major capability under the original conditions.
-- Keywords: AI product evaluation, full product testing, response scoring, quality benchmark, feature coverage
+- Use when: Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.
+- Prompt: Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions.
+- Verify: Every inventoried product surface meets its documented acceptance criteria. The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.
+- Keywords: production-grade QA, production-like local testing, exhaustive product testing, real user testing, UI control coverage, edge case testing, bug documentation, full regression testing
 - Related: [The quality streak loop](https://signals.forwardfuture.ai/loop-library/loops/quality-streak-loop/), [The production data cleanup loop](https://signals.forwardfuture.ai/loop-library/loops/production-data-cleanup-loop/)
 
 ## 011 — [The test-suite speed loop](https://signals.forwardfuture.ai/loop-library/loops/test-suite-speed-loop/)
diff --git a/site/catalog.txt b/site/catalog.txt
@@ -94,10 +94,10 @@ URL above.
 ## 010 — [The full product evaluation loop](https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/)
 
 - Category: Evaluation
-- Use when: Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.
-- Prompt: Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.
-- Verify: Every one of the [N] scenarios meets the defined quality bar. The final evaluated run covers every major capability under the original conditions.
-- Keywords: AI product evaluation, full product testing, response scoring, quality benchmark, feature coverage
+- Use when: Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.
+- Prompt: Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions.
+- Verify: Every inventoried product surface meets its documented acceptance criteria. The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.
+- Keywords: production-grade QA, production-like local testing, exhaustive product testing, real user testing, UI control coverage, edge case testing, bug documentation, full regression testing
 - Related: [The quality streak loop](https://signals.forwardfuture.ai/loop-library/loops/quality-streak-loop/), [The production data cleanup loop](https://signals.forwardfuture.ai/loop-library/loops/production-data-cleanup-loop/)
 
 ## 011 — [The test-suite speed loop](https://signals.forwardfuture.ai/loop-library/loops/test-suite-speed-loop/)
diff --git a/site/feed.xml b/site/feed.xml
@@ -114,7 +114,7 @@
     <id>https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/</id>
     <link href="https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/" />
     <published>2026-06-16T00:00:00-07:00</published>
-    <updated>2026-06-17T00:00:00-07:00</updated>
+    <updated>2026-06-21T00:00:00-07:00</updated>
     <author>
       <name>Matthew Berman</name>
     </author>
diff --git a/site/index.html b/site/index.html
@@ -1025,7 +1025,7 @@ <h3>
                 data-category="evaluation"
                 data-published="2026-06-16"
                 data-featured="true"
-                data-search="full product evaluation realistic tests test cases scenarios major features capabilities score responses results outcomes success criteria pass fail scoring rubric evidence quality bar rerun matthew berman"
+                data-search="production grade product qa full app testing production like local dataset real user every feature role route button input modal state workflow edge case bug documentation regression fix matthew berman"
               >
                 <td class="cell-loop">
                   <div class="loop-meta">
@@ -1038,17 +1038,18 @@ <h3>
                       The full product evaluation loop
                     </a>
                   </h3>
-                  <p class="loop-summary">Tests every major product capability and fixes outcomes below the quality bar.</p>
+                  <p class="loop-summary">Recreates production locally, tests every product surface, and fixes all verified bugs holistically.</p>
                   <p data-prompt>
-                    Create [N] realistic scenarios covering every major
-                    capability. Before testing, define clear success criteria
-                    and choose a consistent evaluation method, such as pass/fail
-                    checks or a scoring rubric. Run every scenario under the
-                    same conditions and record evidence for each outcome. Fix
-                    the underlying cause of anything that does not meet the
-                    criteria, rerun the affected scenarios, and then rerun the
-                    complete set. Continue until every scenario meets the
-                    original quality bar.
+                    Build sanitized, production-scale local data under
+                    production-like settings. Inventory every user-facing
+                    feature, role, route, button, input, modal, state, and
+                    workflow; define documented acceptance criteria and finite
+                    risk-based edge cases for each. Test as a real user,
+                    logging every bug with reproduction evidence. Review
+                    findings for shared causes and dependencies; implement
+                    coherent fixes with regression tests, then rerun the full
+                    inventory. Stop at a clean pass or blocked handoff. Ask
+                    before production, sensitive data, or destructive actions.
                   </p>
                 </td>
                 <td class="cell-action">
diff --git a/site/loops/full-product-evaluation-loop/index.html b/site/loops/full-product-evaluation-loop/index.html
diff --git a/site/sitemap.xml b/site/sitemap.xml
diff --git a/skills/loop-library/references/catalog.md b/skills/loop-library/references/catalog.md