diff --git a/scripts/loop-data.mjs b/scripts/loop-data.mjs index 942d8e5..e3846aa 100644 --- a/scripts/loop-data.mjs +++ b/scripts/loop-data.mjs @@ -407,37 +407,41 @@ export const loops = [ slug: "full-product-evaluation-loop", title: "The full product evaluation loop", summary: - "Tests every major product capability and fixes outcomes below the quality bar.", - seoTitle: "Full Product Evaluation Loop for AI Systems | Loop Library", + "Recreates production locally, tests every product surface, and fixes all verified bugs holistically.", + seoTitle: "Production-Grade Full Product Evaluation Loop | Loop Library", description: "A comprehensive product-quality workflow that evaluates realistic scenarios across every major capability, fixes weak outcomes, and reruns them to the defined bar.", categoryLabel: "AI product evaluation workflow", author: "Matthew Berman", published: "2026-06-16", - modified: "2026-06-17", + modified: "2026-06-21", prompt: - "Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.", - verifyTitle: "Every one of the [N] scenarios meets the defined quality bar.", + "Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. 
Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions.", + verifyTitle: "Every inventoried product surface meets its documented acceptance criteria.", verifyDetail: - "The final evaluated run covers every major capability under the original conditions.", + "The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.", useWhen: - "Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.", + "Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.", steps: [ - "List every major capability, define the success criteria and evaluation method, choose [N], and allocate realistic scenarios across the product surface.", - "Run the full set under consistent conditions and evaluate every outcome with evidence.", - "Document each scenario that misses the criteria, fix the underlying issue, and add focused regression coverage where appropriate.", - "Rerun affected scenarios and then the complete set until every outcome meets the original quality bar.", + "Build a sanitized or synthetic production-scale local dataset, mirror safe production settings, and record unavoidable differences.", + "Inventory every user-facing feature, role, route, control, state, and workflow; define documented acceptance criteria and a finite risk-based edge-case set for each item.", + "Exercise every inventory item as a real user under its normal and defined edge-case conditions, logging each bug immediately with reproducible evidence.", + "Review the complete bug set for shared causes, dependencies, and conflicting fixes, then implement the smallest coherent 
solution with regression coverage.", + "Rerun affected paths and the complete inventory; stop only at a clean full pass or an explicit blocked handoff.", ], why: - "A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.", + "A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.", note: - "Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.", + "Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.", keywords: [ - "AI product evaluation", - "full product testing", - "response scoring", - "quality benchmark", - "feature coverage", + "production-grade QA", + "production-like local testing", + "exhaustive product testing", + "real user testing", + "UI control coverage", + "edge case testing", + "bug documentation", + "full regression testing", ], related: ["quality-streak-loop", "production-data-cleanup-loop"], }, diff --git a/site/catalog.json b/site/catalog.json index 97580db..8a528f6 100644 --- a/site/catalog.json +++ b/site/catalog.json @@ -478,28 +478,32 @@ }, "author": "Matthew Berman", "published": "2026-06-16", - "modified": "2026-06-17", + "modified": "2026-06-21", "description": "A comprehensive product-quality workflow that evaluates realistic scenarios across every major capability, fixes weak outcomes, and reruns them to the defined bar.", - "useWhen": "Use this for an end-to-end product 
evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.", - "prompt": "Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.", + "useWhen": "Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.", + "prompt": "Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions.", "verification": { - "title": "Every one of the [N] scenarios meets the defined quality bar.", - "detail": "The final evaluated run covers every major capability under the original conditions." + "title": "Every inventoried product surface meets its documented acceptance criteria.", + "detail": "The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence." 
}, "steps": [ - "List every major capability, define the success criteria and evaluation method, choose [N], and allocate realistic scenarios across the product surface.", - "Run the full set under consistent conditions and evaluate every outcome with evidence.", - "Document each scenario that misses the criteria, fix the underlying issue, and add focused regression coverage where appropriate.", - "Rerun affected scenarios and then the complete set until every outcome meets the original quality bar." - ], - "why": "A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.", - "implementationNote": "Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.", + "Build a sanitized or synthetic production-scale local dataset, mirror safe production settings, and record unavoidable differences.", + "Inventory every user-facing feature, role, route, control, state, and workflow; define documented acceptance criteria and a finite risk-based edge-case set for each item.", + "Exercise every inventory item as a real user under its normal and defined edge-case conditions, logging each bug immediately with reproducible evidence.", + "Review the complete bug set for shared causes, dependencies, and conflicting fixes, then implement the smallest coherent solution with regression coverage.", + "Rerun affected paths and the complete inventory; stop only at a clean full pass or an explicit blocked handoff." + ], + "why": "A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. 
Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.", + "implementationNote": "Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.", "keywords": [ - "AI product evaluation", - "full product testing", - "response scoring", - "quality benchmark", - "feature coverage" + "production-grade QA", + "production-like local testing", + "exhaustive product testing", + "real user testing", + "UI control coverage", + "edge case testing", + "bug documentation", + "full regression testing" ], "related": [ { diff --git a/site/catalog.md b/site/catalog.md index 5f7b36e..ba4c73b 100644 --- a/site/catalog.md +++ b/site/catalog.md @@ -94,10 +94,10 @@ URL above. ## 010 — [The full product evaluation loop](https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/) - Category: Evaluation -- Use when: Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples. -- Prompt: Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar. -- Verify: Every one of the [N] scenarios meets the defined quality bar. The final evaluated run covers every major capability under the original conditions. 
-- Keywords: AI product evaluation, full product testing, response scoring, quality benchmark, feature coverage +- Use when: Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features. +- Prompt: Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions. +- Verify: Every inventoried product surface meets its documented acceptance criteria. The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence. +- Keywords: production-grade QA, production-like local testing, exhaustive product testing, real user testing, UI control coverage, edge case testing, bug documentation, full regression testing - Related: [The quality streak loop](https://signals.forwardfuture.ai/loop-library/loops/quality-streak-loop/), [The production data cleanup loop](https://signals.forwardfuture.ai/loop-library/loops/production-data-cleanup-loop/) ## 011 — [The test-suite speed loop](https://signals.forwardfuture.ai/loop-library/loops/test-suite-speed-loop/) diff --git a/site/catalog.txt b/site/catalog.txt index 5f7b36e..ba4c73b 100644 --- a/site/catalog.txt +++ b/site/catalog.txt @@ -94,10 +94,10 @@ URL above. 
## 010 — [The full product evaluation loop](https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/) - Category: Evaluation -- Use when: Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples. -- Prompt: Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar. -- Verify: Every one of the [N] scenarios meets the defined quality bar. The final evaluated run covers every major capability under the original conditions. -- Keywords: AI product evaluation, full product testing, response scoring, quality benchmark, feature coverage +- Use when: Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features. +- Prompt: Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions. +- Verify: Every inventoried product surface meets its documented acceptance criteria. 
The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence. +- Keywords: production-grade QA, production-like local testing, exhaustive product testing, real user testing, UI control coverage, edge case testing, bug documentation, full regression testing - Related: [The quality streak loop](https://signals.forwardfuture.ai/loop-library/loops/quality-streak-loop/), [The production data cleanup loop](https://signals.forwardfuture.ai/loop-library/loops/production-data-cleanup-loop/) ## 011 — [The test-suite speed loop](https://signals.forwardfuture.ai/loop-library/loops/test-suite-speed-loop/) diff --git a/site/feed.xml b/site/feed.xml index 46357da..d91c5f9 100644 --- a/site/feed.xml +++ b/site/feed.xml @@ -114,7 +114,7 @@ https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/ 2026-06-16T00:00:00-07:00 - 2026-06-17T00:00:00-07:00 + 2026-06-21T00:00:00-07:00 Matthew Berman diff --git a/site/index.html b/site/index.html index 5ffd53c..90fe3eb 100644 --- a/site/index.html +++ b/site/index.html @@ -1025,7 +1025,7 @@

data-category="evaluation" data-published="2026-06-16" data-featured="true" - data-search="full product evaluation realistic tests test cases scenarios major features capabilities score responses results outcomes success criteria pass fail scoring rubric evidence quality bar rerun matthew berman" + data-search="production grade product qa full app testing production like local dataset real user every feature role route button input modal state workflow edge case bug documentation regression fix matthew berman" >
@@ -1038,17 +1038,18 @@

The full product evaluation loop

-

Tests every major product capability and fixes outcomes below the quality bar.

+

Recreates production locally, tests every product surface, and fixes all verified bugs holistically.

- Create [N] realistic scenarios covering every major - capability. Before testing, define clear success criteria - and choose a consistent evaluation method, such as pass/fail - checks or a scoring rubric. Run every scenario under the - same conditions and record evidence for each outcome. Fix - the underlying cause of anything that does not meet the - criteria, rerun the affected scenarios, and then rerun the - complete set. Continue until every scenario meets the - original quality bar. + Build sanitized, production-scale local data under + production-like settings. Inventory every user-facing + feature, role, route, button, input, modal, state, and + workflow; define documented acceptance criteria and finite + risk-based edge cases for each. Test as a real user, + logging every bug with reproduction evidence. Review + findings for shared causes and dependencies; implement + coherent fixes with regression tests, then rerun the full + inventory. Stop at a clean pass or blocked handoff. Ask + before production, sensitive data, or destructive actions.

diff --git a/site/loops/full-product-evaluation-loop/index.html b/site/loops/full-product-evaluation-loop/index.html index 3b8fcb3..43c5b9a 100644 --- a/site/loops/full-product-evaluation-loop/index.html +++ b/site/loops/full-product-evaluation-loop/index.html @@ -35,7 +35,7 @@ - + @@ -45,9 +45,9 @@ - + - + @@ -91,14 +91,17 @@ "url": "https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/", "mainEntityOfPage": "https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/", "datePublished": "2026-06-16", - "dateModified": "2026-06-17", + "dateModified": "2026-06-21", "articleSection": "AI product evaluation workflow", "keywords": [ - "AI product evaluation", - "full product testing", - "response scoring", - "quality benchmark", - "feature coverage" + "production-grade QA", + "production-like local testing", + "exhaustive product testing", + "real user testing", + "UI control coverage", + "edge case testing", + "bug documentation", + "full regression testing" ], "image": { "@type": "ImageObject", @@ -133,7 +136,7 @@ } - Full Product Evaluation Loop for AI Systems | Loop Library + Production-Grade Full Product Evaluation Loop | Loop Library @@ -218,7 +221,7 @@

The full product evaluation loop

class="share-action share-action-primary" type="button" data-copy-social-post - data-post-text="Try &quot;The full product evaluation loop&quot; from the Loop Library: Tests every major product capability and fixes outcomes below the quality bar." + data-post-text="Try &quot;The full product evaluation loop&quot; from the Loop Library: Recreates production locally, tests every product surface, and fixes all verified bugs holistically." data-post-url="https://signals.forwardfuture.ai/loop-library/loops/full-product-evaluation-loop/" aria-label="Copy a social post about The full product evaluation loop" > @@ -242,14 +245,14 @@

Copy the loop

Copy
-

Create [N] realistic scenarios covering every major capability. Before testing, define clear success criteria and choose a consistent evaluation method, such as pass/fail checks or a scoring rubric. Run every scenario under the same conditions and record evidence for each outcome. Fix the underlying cause of anything that does not meet the criteria, rerun the affected scenarios, and then rerun the complete set. Continue until every scenario meets the original quality bar.

+

Build sanitized, production-scale local data under production-like settings. Inventory every user-facing feature, role, route, button, input, modal, state, and workflow; define documented acceptance criteria and finite risk-based edge cases for each. Test as a real user, logging every bug with reproduction evidence. Review findings for shared causes and dependencies; implement coherent fixes with regression tests, then rerun the full inventory. Stop at a clean pass or blocked handoff. Ask before production, sensitive data, or destructive actions.

Verify / stop

-

Every one of the [N] scenarios meets the defined quality bar.

-

The final evaluated run covers every major capability under the original conditions.

+

Every inventoried product surface meets its documented acceptance criteria.

+

The final full regression run covers every inventoried surface and its finite risk-based edge cases in the production-like local environment, with each reproducible bug fixed and backed by evidence.

@@ -267,33 +270,34 @@

Every one of the [N] scenarios meets the defined quality ba
Updated
-
+

Use this when

-

Use this for an end-to-end product evaluation when quality must be measured across the full feature set rather than a narrow regression or a few hand-picked examples.

+

Use this for an exhaustive, end-to-end application QA pass when a production-like local environment and complete interactive-surface coverage matter more than a narrow regression or sample of major features.

How to run it

    -
  1. List every major capability, define the success criteria and evaluation method, choose [N], and allocate realistic scenarios across the product surface.
  2. -
  3. Run the full set under consistent conditions and evaluate every outcome with evidence.
  4. -
  5. Document each scenario that misses the criteria, fix the underlying issue, and add focused regression coverage where appropriate.
  6. -
  7. Rerun affected scenarios and then the complete set until every outcome meets the original quality bar.
  8. +
  9. Build a sanitized or synthetic production-scale local dataset, mirror safe production settings, and record unavoidable differences.
  10. +
  11. Inventory every user-facing feature, role, route, control, state, and workflow; define documented acceptance criteria and a finite risk-based edge-case set for each item.
  12. +
  13. Exercise every inventory item as a real user under its normal and defined edge-case conditions, logging each bug immediately with reproducible evidence.
  14. +
  15. Review the complete bug set for shared causes, dependencies, and conflicting fixes, then implement the smallest coherent solution with regression coverage.
  16. +
  17. Rerun affected paths and the complete inventory; stop only at a clean full pass or an explicit blocked handoff.

Why it works

-

A fixed capability map and consistent evaluation method make product quality visible across the whole system. Requiring a final complete run catches fixes that improve one scenario while weakening another.

+

A finite surface inventory prevents major controls and states from disappearing behind a few happy-path scenarios. Reviewing all findings before fixing them exposes shared causes and interactions, while the final full run catches changes that repair one path but weaken another.

Implementation note

-

Keep the scenario set representative and preserve failed examples. Aggregate results can hide severe misses, so require every scenario to clear the bar.

+

Do not copy secrets or sensitive production data into the local environment, touch production without approval, or count an untested or blocked surface as passing. Preserve the inventory, bug log, environment differences, and final evidence for review.